From 0813b1ac5c5f1b60a02c9cc9368041b78ca9417a Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Tue, 27 Jan 2026 12:32:42 +0100 Subject: [PATCH 01/15] Add BIOS scraping from Redfish Signed-off-by: Pavel Abramov --- conf/config.yaml | 16 +++ main.py | 6 + src/bios_settings.py | 260 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 282 insertions(+) create mode 100644 src/bios_settings.py diff --git a/conf/config.yaml b/conf/config.yaml index a8deb35..0b09393 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -1,6 +1,22 @@ benchmark_output_path: "${hydra:run.dir}/output.csv" sysinfo_collector_file: "${hydra:run.dir}/sysinfo.json" +# --- Configuration for getting BIOS data from redfish + +bios: + enable: true + redfish: + host: "0.0.0.0" + username: "root" + password: "SECRET_PASSWORD" + verify_ssl: false + timeout: 15 + + output: + format: "json" + file: "${hydra:run.dir}/bios.json" + pretty: true + # --- Intel CAT specific configuration --- pqos: diff --git a/main.py b/main.py index 707a839..af8875b 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,8 @@ from omegaconf import DictConfig, OmegaConf +from src.bios_settings import process_bios_settings + from src.metrics import ( CPUmonitor, InterruptMonitor, @@ -93,6 +95,10 @@ def main(cfg: DictConfig): collector.gather_all(cfg) collector.dump_to_file(cfg.sysinfo_collector_file) + # Collect BIOS settings via redfish + if cfg.bios.enable: + process_bios_settings(cfg.bios) + if cfg.demo.demo_mode: print("Running in demo mode. 
Skipping test execution.") setup_pqos(cfg) diff --git a/src/bios_settings.py b/src/bios_settings.py new file mode 100644 index 0000000..7753cf2 --- /dev/null +++ b/src/bios_settings.py @@ -0,0 +1,260 @@ +import argparse +import urllib3 +import json +import re +import sys + +from omegaconf import DictConfig +from pathlib import Path +from typing import Dict, Optional, Tuple + + +try: + import requests + from requests.auth import HTTPBasicAuth + + REQUESTS_AVAILABLE = True +except ImportError: + print( + "ERROR: requests library not found. Install with: pip install requests", + file=sys.stderr, + ) + sys.exit(1) + +try: + import yaml + + YAML_AVAILABLE = True +except ImportError: + YAML_AVAILABLE = False + +# Disable SSL warnings for self-signed iDRAC certificates +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +import urllib3 + + +def process_bios_settings(cfg: DictConfig): + """ + Fetches BIOS settings using parameters from a Hydra config object. + + Expected Config Structure (example): + redfish: + enabled: true + host: "1.2.3.4" + username: "root" + password: "password" + verify_ssl: false + timeout: 10 + output: + format: "text" # json, yaml, text + file: "bios.json" # or null for stdout + pretty: true + """ + + # 1. Validation & Setup + # Access keys safely. Using .get() allows for defaults if keys are missing in YAML. + # We assume 'redfish' and 'output' are root keys in the passed cfg, + # or you can pass cfg.bios_task to this function. 
+ rf_cfg = cfg.get("redfish", {}) + out_cfg = cfg.get("output", {}) + + if not rf_cfg.get("enabled", True): + print("WARNING: Redfish task is disabled in config", file=sys.stderr) + return + + host = rf_cfg.get("host") + username = rf_cfg.get("username", "root") + password = rf_cfg.get("password") + verify_ssl = rf_cfg.get("verify_ssl", False) + timeout = rf_cfg.get("timeout", 10) + + if not host: + print("ERROR: No host specified in config (redfish.host)", file=sys.stderr) + raise ValueError("Host Missing") + + if not password: + raise ValueError("ERROR: No password specified in config (redfish.password)") + + session, host_url = connect_redfish(host, username, password, verify_ssl, timeout) + if not session: + raise ValueError("Session missing") + + attributes = get_bios_attributes(session, host_url, timeout) + if not attributes: + raise ValueError("Attributes missing") + + output_fmt = out_cfg.get("format", "text") + is_pretty = out_cfg.get("pretty", True) + + if output_fmt == "json": + output_data = format_json(attributes, pretty=is_pretty) + elif output_fmt == "yaml": + output_data = format_yaml(attributes) + else: + output_data = format_text(attributes) + + output_file = out_cfg.get("file") + + if output_file: + try: + with open(output_file, "w") as f: + f.write(output_data) + print(f"✓ Saved to {output_file}", file=sys.stderr) + except Exception as e: + raise ValueError(f"ERROR: Failed to write to {output_file}: {e}") + else: + # Print to stdout + print(output_data) + + +def connect_redfish( + host: str, username: str, password: str, verify_ssl: bool = False, timeout: int = 10 +) -> Tuple[Optional[requests.Session], str]: + """ + Create authenticated Redfish session to iDRAC. 
+ + Returns: + Tuple of (authenticated session or None, normalized host URL) + """ + # Ensure host has https:// prefix + if not host.startswith("http://") and not host.startswith("https://"): + host = f"https://{host}" + + session = requests.Session() + session.auth = HTTPBasicAuth(username, password) + session.verify = verify_ssl + session.headers.update( + {"Content-Type": "application/json", "Accept": "application/json"} + ) + + # Test connection with root endpoint + try: + print(f"Connecting to {host}...", file=sys.stderr) + response = session.get(f"{host}/redfish/v1/", timeout=timeout) + response.raise_for_status() + print(f"✓ Connected successfully", file=sys.stderr) + return session, host + except requests.exceptions.Timeout: + print(f"ERROR: Connection timeout to {host}", file=sys.stderr) + return None, host + except requests.exceptions.ConnectionError as e: + print(f"ERROR: Connection failed to {host}: {e}", file=sys.stderr) + return None, host + except requests.exceptions.HTTPError as e: + print(f"ERROR: HTTP error from {host}: {e}", file=sys.stderr) + return None, host + except Exception as e: + print(f"ERROR: Unexpected error connecting to {host}: {e}", file=sys.stderr) + return None, host + + +def get_bios_attributes( + session: requests.Session, host: str, timeout: int = 10 +) -> Optional[Dict[str, any]]: + """ + Retrieve BIOS attributes from Dell iDRAC. 
+ + Returns: + Dictionary of BIOS attributes or None on failure + """ + try: + # Dell iDRAC standard endpoint for BIOS settings + url = f"{host}/redfish/v1/Systems/System.Embedded.1/Bios" + print(f"Fetching BIOS attributes from {url}...", file=sys.stderr) + + response = session.get(url, timeout=timeout) + response.raise_for_status() + data = response.json() + + # BIOS attributes are in the "Attributes" key + attributes = data.get("Attributes", {}) + print(f"✓ Retrieved {len(attributes)} BIOS attributes", file=sys.stderr) + return attributes + except requests.exceptions.HTTPError as e: + print(f"ERROR: Failed to fetch BIOS attributes: {e}", file=sys.stderr) + print( + f" Response: {e.response.text if e.response is not None else 'No response'}", + file=sys.stderr, + ) + return None + except Exception as e: + print(f"ERROR: Unexpected error fetching BIOS: {e}", file=sys.stderr) + return None + + +def format_text(attributes: Dict[str, any]) -> str: + """ + Format BIOS attributes in human-readable text format. + Groups attributes by prefix for better organization. 
+ """ + if not attributes: + return "No BIOS attributes available" + + lines = [] + lines.append("=" * 80) + lines.append(f"BIOS SETTINGS ({len(attributes)} total attributes)") + lines.append("=" * 80) + lines.append("") + + # Group by prefix (first word before capital letter) + groups = {} + ungrouped = [] + + for key in sorted(attributes.keys()): + value = attributes[key] + # Try to extract prefix (e.g., "Proc" from "ProcTurboMode") + match = re.match(r"^([A-Z][a-z]+)", key) + if match: + prefix = match.group(1) + if prefix not in groups: + groups[prefix] = [] + groups[prefix].append((key, value)) + else: + ungrouped.append((key, value)) + + # Output grouped attributes + for prefix in sorted(groups.keys()): + lines.append(f"[{prefix}*] Settings ({len(groups[prefix])} attributes)") + lines.append("-" * 80) + for key, value in groups[prefix]: + # Format value nicely + if isinstance(value, bool): + value_str = "Enabled" if value else "Disabled" + elif isinstance(value, str) and len(value) > 60: + value_str = value[:57] + "..." + else: + value_str = str(value) + lines.append(f" {key:<40} = {value_str}") + lines.append("") + + # Output ungrouped attributes + if ungrouped: + lines.append(f"[Other] Settings ({len(ungrouped)} attributes)") + lines.append("-" * 80) + for key, value in ungrouped: + if isinstance(value, bool): + value_str = "Enabled" if value else "Disabled" + elif isinstance(value, str) and len(value) > 60: + value_str = value[:57] + "..." 
+ else: + value_str = str(value) + lines.append(f" {key:<40} = {value_str}") + lines.append("") + + return "\n".join(lines) + + +def format_json(attributes: Dict[str, any], pretty: bool = True) -> str: + """Format BIOS attributes as JSON.""" + if pretty: + return json.dumps(attributes, indent=2, sort_keys=True) + else: + return json.dumps(attributes, sort_keys=True) + + +def format_yaml(attributes: Dict[str, any]) -> str: + """Format BIOS attributes as YAML.""" + if not YAML_AVAILABLE: + raise ValueError("ERROR: PyYAML not installed. Cannot output YAML format.") + return yaml.dump(attributes, default_flow_style=False, sort_keys=True) From 19ac81e227a66ce5c82b9a3d61484a2fb7e20481 Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Tue, 27 Jan 2026 13:09:01 +0100 Subject: [PATCH 02/15] Update README Signed-off-by: Pavel Abramov --- README.md | 209 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 191 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 245f89d..0384289 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,14 @@ # rtos_bench: benchmarking suite to analyse real-time (RT) performance of an operating system -A lightweight, configurable Python framework for running system and application benchmarks using Hydra for flexible experiment management. -This repository provides a single entry-point script to run various performance tests with reproducible configurations defined in conf/config.yaml. +A comprehensive Python framework for benchmarking, analyzing and validating real-time (RT) performance of operating systems. Combines Docker-containerized benchmarks with statistical analysis tools based on Extreme Value Theory (EVT) to determine if a system meets real-time requirements. 
+ +## Key Features + +- **Containerized Benchmarks**: Run reproducible RT benchmarks (Caterpillar, Cyclictest, iperf3, CODESYS) in Docker or on your host system +- **Intel RDT Integration**: Full support for Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) +- **Statistical RT Validation**: EVT-based analysis with Region of Acceptance (RoA) for probabilistic WCET estimation +- **BIOS Collection via Redfish**: Automatically capture BIOS settings from BMC/iDRAC before benchmarks +- **Jupyter Analysis Notebooks**: Interactive reports for analyzing benchmark results and RT readiness ## Prerequisites: @@ -19,12 +26,32 @@ curl -LsSf https://astral.sh/uv/install.sh | sh uv sync ``` -## How to run benchmark +4. Additional system requirements: + - Docker (for containerized execution) + - `intel-cmt-cat` package (for pqos/Intel RDT support) + - Root access (required for pqos, IRQ affinity, and some metrics) + + + +## Quick Start ```bash -uv run main.py + +# Install dependencies and virtual environment (venv) +uv sync + +# Build all Docker images first +sudo .venv/bin/python3 main.py run.command=build + +# Run a benchmark (e.g. caterpillar) +sudo .venv/bin/python3 main.py run.command=caterpillar + +# Analyze results in Jupyter +uv run jupyter-lab ``` +> **Note**: Benchmarks require root access for pqos, IRQ affinity configuration, and hardware monitoring. 
Use `sudo .venv/bin/python3 main.py` instead of `uv run main.py` + ## How to run jupyter notebook (analysis software) ``` @@ -56,7 +83,6 @@ After that you can open any report and run it, just double-click on it like here ├── iperf3/ ├── mega-benchmark/ ├── codesys-jitter-benchmark/ -├── data/ # Store experiments here ├── outputs/ # Where we run experiment bundles ├── notebooks/ # Jupyter notebooks to analyse data ├── src/ # libraries @@ -73,24 +99,171 @@ All experiment parameters are controlled via Hydra’s configuration file at: conf/config.yaml ``` -## Example configuration +You can override any configuration parameter from the command line: +```bash +sudo .venv/bin/python3 main.py run.command=cyclictest run.t_core="3,5" ``` + +## Run Configuration + +```yaml run: - command: "caterpillar" - llc_cache_mask: "0x000f" - t_core: "3" - stressor: true - tests_path: "tests" + command: "caterpillar" # Benchmark to run + t_core: "9,11" # Target CPU cores + numa_node: "1" # NUMA node for cpuset-mems (should be same as NUMA node for t_core) + stressor: true # Enable stress workload + metrics: true # Enable metrics monitoring + docker: true # Run inside Docker container + cat_clos_pinning: + enable: true # Pin test PID to CLOS + clos: 1 # CLOS ID to use +``` + +| Parameter | Type | Description | +| ---------------------------- | ------- | -------------------------------------------------------------------------------------------------------------- | +| `run.command` | str | Benchmark to run: `caterpillar`, `cyclictest`, `iperf3`, `mega-benchmark`, `codesys-jitter-benchmark`, `codesys-opcua-pubsub`, or `build`. 
| +| `run.t_core` | str | Target CPU cores for running the benchmark (e.g., `"3,5,7,9"` or `"9,11"`) | +| `run.numa_node` | str | NUMA node for cpuset-mems (should be same as NUMA node for t_core) | +| `run.stressor` | bool | Enables additional stress workload during the benchmark | +| `run.metrics` | bool | Enable real-time metrics monitoring (CPU temp, IRQs, memory, etc.) | +| `run.docker` | bool | Run benchmark inside Docker container (if `false`, runs on host) | +| `run.cat_clos_pinning.enable`| bool | Enable pinning test PID to specified CLOS (caterpillar/cyclictest only) | +| `run.cat_clos_pinning.clos` | int | CLOS ID to pin the test process to | + +## Intel RDT/CAT Configuration (pqos) + +Configure Intel Resource Director Technology (Cache Allocation Technology, Memory Bandwidth Allocation): + +```yaml +pqos: + interface: "os" # 'os' for resctrl (recommended), 'msr' for direct access + reset_before_apply: true # Reset all allocations before applying new ones + + classes: + - id: 1 + description: "real-time workload" + l3_mask: "0x00ff" # L3 cache mask (8 cache ways) + l2_mask: "0x00ff" # L2 cache mask + mba: 100 # Memory Bandwidth Allocation (%) + pids: [] # PIDs to assign to this class + cores: [] # CPU cores to assign to this class + - id: 0 + description: "background worker" + l3_mask: "0x7f00" # Different cache ways for isolation + l2_mask: "0xff00" + mba: 10 + cores: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + pids: [115, 118] ``` -| Parameter | Type | Description | -| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------- | -| `command` | str | Benchmark to run. One of: `caterpillar`, `cyclictest`, `iperf3`, `mega-benchmark`, `codesys-jitter-benchmark`. | -| `llc_cache_mask` | str | Hexadecimal mask for Last-Level Cache (LLC) configuration. | -| `t_core` | str/int | Target CPU core for running the benchmark (i.e. 
'3,5,7,9') | -| `stressor` | bool | Enables additional stress workload during the benchmark. | -| `tests_path` | str | Path to the directory containing benchmark implementations. | +| Parameter | Type | Description | +| -------------------------- | ------ | ------------------------------------------------------------------ | +| `pqos.interface` | str | Interface mode: `os` (resctrl, required for PIDs) or `msr` (direct)| +| `pqos.reset_before_apply` | bool | Reset all allocations before applying new configuration | +| `pqos.classes[].id` | int | Class of Service (CLOS) ID | +| `pqos.classes[].l3_mask` | str | Hexadecimal L3 cache way mask | +| `pqos.classes[].l2_mask` | str | Hexadecimal L2 cache way mask | +| `pqos.classes[].mba` | int | Memory Bandwidth Allocation percentage (10-100) | +| `pqos.classes[].cores` | list | CPU cores to assign to this CLOS leave empty if not used | +| `pqos.classes[].pids` | list | Process IDs to assign to this CLOS leave empty if not used | + + +## IRQ Affinity Configuration + +Configure IRQ and RCU task affinity to isolate real-time cores: + +```yaml +irq_affinity: + enabled: true + housekeeping_cores: "0-1" # Cores for handling IRQs and RCU +``` + +## BIOS Settings Collection via Redfish + +Automatically collect BIOS settings from servers with Redfish-enabled BMC (e.g., Dell iDRAC) before running benchmarks: + +```yaml +bios: + enable: true + redfish: + host: "192.168.1.100" # BMC/iDRAC IP address + username: "root" + password: "YOUR_PASSWORD" + verify_ssl: false # Set to true for valid SSL certificates + timeout: 15 + + output: + format: "json" # Output format: json, yaml, or text + file: "${hydra:run.dir}/bios.json" + pretty: true +``` + +| Parameter | Type | Description | +| ------------------------- | ------ | ------------------------------------------------------------------ | +| `bios.enable` | bool | Enable/disable BIOS settings collection | +| `bios.redfish.host` | str | BMC/iDRAC hostname or IP address | +| 
`bios.redfish.username` | str | Username for Redfish API authentication | +| `bios.redfish.password` | str | Password for Redfish API authentication | +| `bios.redfish.verify_ssl` | bool | Verify SSL certificates (set `false` for self-signed certs) | +| `bios.redfish.timeout` | int | Connection timeout in seconds | +| `bios.output.format` | str | Output format: `json`, `yaml`, or `text` | +| `bios.output.file` | str | Path to save BIOS settings (supports Hydra interpolation) | +| `bios.output.pretty` | bool | Enable pretty-printing for JSON output | + +## Test-Specific Configuration + +### Caterpillar +```yaml +caterpillar: + n_cycles: 7200 # Number of measurement cycles +``` + +### Cyclictest +```yaml +cyclictest: + loops: 100000 # Number of test loops +``` + +## Metrics Monitoring + +When `run.metrics: true`, the following monitors collect data during benchmark execution: + +| Monitor | Output File | Description | +| ---------------- | ------------------------------ | ---------------------------------------- | +| CPU Monitor | `cpu_monitor.csv` | Per-core CPU temperatures | +| IRQ Monitor | `irq_monitor.csv` | Interrupt counts per CPU | +| MemInfo Monitor | `meminfo_monitor.csv` | Memory statistics from `/proc/meminfo` | +| SoftIRQ Monitor | `softirq_monitor.csv` | Software interrupt statistics | +| CPUStat Monitor | `cpustat_monitor.csv` | CPU usage statistics | +| PQOS Monitor | `pqos_monitor.csv` | Intel RDT monitoring data | + +Configure monitoring intervals in the config: + +```yaml +cpu_monitor: + path: "${hydra:run.dir}/cpu_monitor.csv" + interval: 1.0 +``` + +## Output Files + +Each benchmark run creates a timestamped directory in `outputs/` containing: + +- `output.csv` - Benchmark results +- `sysinfo.json` - System information snapshot (includes Hydra configuration) +- `bios.json` - BIOS settings (if enabled) +- `*_monitor.csv` - Various metrics logs (if enabled) +- `.hydra/` - Hydra configuration logs + +## Security Note +⚠️ **Important**: The Redfish 
password is stored in the configuration file. Consider: +- Using environment variables for sensitive credentials +- Restricting file permissions on `config.yaml` +- Not committing passwords to version control +## References +- [Dealing with Uncertainty in pWCET Estimations](https://dl.acm.org/doi/abs/10.1145/3396234) - Region of Acceptance methodology +- [Probabilistic-WCET Reliability](https://dl.acm.org/doi/10.1145/3126495) - EVT hypothesis validation From d98406754d2e9a7fe8a571471694611b13f93f6b Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Wed, 11 Feb 2026 11:58:56 +0100 Subject: [PATCH 03/15] Add reboot testing feature Signed-off-by: Pavel Abramov --- conf/config.yaml | 1 + main.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/conf/config.yaml b/conf/config.yaml index 0b09393..df8f960 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -56,6 +56,7 @@ irq_affinity: run: command: "caterpillar" t_core: "9,11" + max_count: 10 numa_node: "1" stressor: true # tests_path: "tests" diff --git a/main.py b/main.py index af8875b..40bcbf0 100644 --- a/main.py +++ b/main.py @@ -89,8 +89,7 @@ def setup_metrics(cfg: DictConfig) -> None: pqos_monitor.start() -@hydra.main(version_base=None, config_path="conf", config_name="config") -def main(cfg: DictConfig): +def run_test(cfg: DictConfig): collector = SystemInfoCollector() collector.gather_all(cfg) collector.dump_to_file(cfg.sysinfo_collector_file) @@ -137,5 +136,70 @@ def main(cfg: DictConfig): return runner.run_test(cfg.run.command, cfg.run.t_core, cfg.run.stressor) +@hydra.main(version_base=None, config_path="conf", config_name="config") +def main(cfg: DictConfig): + execution_dir = os.getcwd() + counter_file = "/var/tmp/rt_tools_cur_count.txt" + service_name = "program-reboot.service" + service_path = f"/etc/systemd/system/{service_name}" + max_count = cfg.run.max_count + + if max_count <= 1: + print("max_count <=1. 
Running once and exiting.") + run_test(cfg) + sys.exit(0) + + cur_count = 0 + if os.path.exists(counter_file): + with open(counter_file, "r") as f: + cur_count = int(f.read().strip()) + else: + cur_count = 0 + + if cur_count == 0: + # First run: Setup systemd + print("First run (cur=0). Creating systemd service...") + service_content = f"""[Unit] +Description=Auto-run main.py on boot +After=network.target + +[Service] +Type=oneshot +User={os.getenv('USER')} +WorkingDirectory={execution_dir} +ExecStart=sudo ./env/python3 main.py +RemainAfterExit=no +Restart=no + +[Install] +WantedBy=multi-user.target +""" + with open(service_path, "w") as f: + f.write(service_content) + subprocess.run(["sudo", "systemctl", "daemon-reload"]) + subprocess.run(["sudo", "systemctl", "enable", service_name]) + + print(f"Run {cur_count + 1}/{max_count}") + run_test(cfg) + + # Increment and check + cur_count += 1 + with open(counter_file, "w") as f: + f.write(str(cur_count)) + + if cur_count >= max_count: + print("Max count reached. 
Cleaning up and exiting.") + if os.path.exists(service_path): + subprocess.run(["sudo", "systemctl", "stop", service_name], check=False) + subprocess.run(["sudo", "systemctl", "disable", service_name], check=False) + os.remove(service_path) + subprocess.run(["sudo", "systemctl", "daemon-reload"]) + os.remove(counter_file) + sys.exit(0) + else: + print("Rebooting for next run...") + os.system("sudo reboot") + + if __name__ == "__main__": main() From d9110acb4dfb360b510de2056bf1b1a325fc04d0 Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Fri, 13 Feb 2026 17:27:36 +0100 Subject: [PATCH 04/15] Make PQOS optional Signed-off-by: Pavel Abramov --- conf/config.yaml | 1 + main.py | 6 ++++-- src/test_runner.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/conf/config.yaml b/conf/config.yaml index df8f960..48d7e0d 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -24,6 +24,7 @@ pqos: # 'msr' = Direct hardware access (Cores only, if some features are # not upstream kernel interface: "os" + enable: true # resets all allocations before applying new ones reset_before_apply: true diff --git a/main.py b/main.py index 40bcbf0..65902f4 100644 --- a/main.py +++ b/main.py @@ -100,7 +100,8 @@ def run_test(cfg: DictConfig): if cfg.demo.demo_mode: print("Running in demo mode. 
Skipping test execution.") - setup_pqos(cfg) + if cfg.pqos.enable: + setup_pqos(cfg) runner = DockerHDE2E(cfg) print("Starting demo HDE2E test...") print("Starting IO...") @@ -121,7 +122,8 @@ def run_test(cfg: DictConfig): if cfg.run.command == "build": return runner.build() - setup_pqos(cfg) + if cfg.pqos.enable: + setup_pqos(cfg) # Handle test commands if cfg.run.command not in runner.tests: diff --git a/src/test_runner.py b/src/test_runner.py index 8d377c0..04468ff 100644 --- a/src/test_runner.py +++ b/src/test_runner.py @@ -276,7 +276,7 @@ def _run_cyclictest( cyclictest_cmd = ( # "chrt -r 95 " f"/usr/bin/cyclictest --threads -t 1 -p 95 " - f"-l {cycles} -d 1 -D 0 -i 100000 -a {t_core}" + f"-l {cycles} -d 1 -D 0 -i {self.config.caterpillar.n_cycles} -a {t_core}" ) if self.config.run.docker: rdtset_cmd = f"stdbuf -oL -eL " f"{cyclictest_cmd}" @@ -547,7 +547,7 @@ def _run_interactive_command(self, cmd: List[str]) -> Popen[str]: bufsize=1, ) - if self.config.run.cat_clos_pinning.enable: + if self.config.run.cat_clos_pinning.enable and self.config.pqos.enable: MAX_RETRIES = 5 SLEEP_TIME = 2 # Seconds to wait between tries From 8066001a55333822064b89e011b8271a4d82f49c Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Fri, 13 Feb 2026 17:53:03 +0100 Subject: [PATCH 05/15] Fix local run Signed-off-by: Pavel Abramov --- main.py | 5 +++-- src/test_runner.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 65902f4..874cf2b 100644 --- a/main.py +++ b/main.py @@ -79,14 +79,15 @@ def setup_metrics(cfg: DictConfig) -> None: cpustat_monitor = CpuStatMonitor( cfg.cpustat_monitor.path, cfg.softirq_monitor.interval ) - pqos_monitor = PQOSMonitor(cfg.pqos_monitor.path, cfg.pqos_monitor.interval) + if cfg.pqos.enable: + pqos_monitor = PQOSMonitor(cfg.pqos_monitor.path, cfg.pqos_monitor.interval) + pqos_monitor.start() cpu_monitor.start() interrupt_monitor.start() meminfo_monitor.start() softirq_monitor.start() cpustat_monitor.start() 
- pqos_monitor.start() def run_test(cfg: DictConfig): diff --git a/src/test_runner.py b/src/test_runner.py index 04468ff..f520604 100644 --- a/src/test_runner.py +++ b/src/test_runner.py @@ -3,6 +3,7 @@ import subprocess import psutil import time +import shlex from typing import List, Optional from omegaconf import DictConfig, OmegaConf @@ -237,7 +238,7 @@ def _run_caterpillar(self, base_cmd: List[str], t_core: str, path: str) -> int: rdtset_cmd, ] else: - cmd = [caterpillar_cmd] + cmd = shlex.split(caterpillar_cmd) print(" ".join(cmd)) try: @@ -287,7 +288,7 @@ def _run_cyclictest( rdtset_cmd, ] else: - cmd = [cyclictest_cmd] + cmd = shlex.split(cyclictest_cmd) print(" ".join(cmd)) From 766e4ead9e55d4d781b7cda09d4c33bee48eff49 Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Fri, 13 Feb 2026 18:03:40 +0100 Subject: [PATCH 06/15] Add dockerfile Signed-off-by: Pavel Abramov --- Dockerfile | 41 +++++++++++++++++++++++++++++++++++++++++ conf/config.yaml | 2 +- 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..192dc54 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,41 @@ +FROM eci-base:latest + +# Add ECI repository key and sources +RUN wget -O- https://eci.intel.com/repos/gpg-keys/GPG-PUB-KEY-INTEL-ECI.gpg | tee /usr/share/keyrings/eci-archive-keyring.gpg > /dev/null + +RUN . 
/etc/os-release && \ + echo "deb [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/${VERSION_CODENAME} isar main" | tee /etc/apt/sources.list.d/eci.list && \ + echo "deb-src [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/${VERSION_CODENAME} isar main" | tee -a /etc/apt/sources.list.d/eci.list + +RUN bash -c 'echo -e "Package: *\nPin: origin eci.intel.com\nPin-Priority: 1000" > /etc/apt/preferences.d/isar' && \ + bash -c 'echo -e "\nPackage: libflann*\nPin: version 1.19.*\nPin-Priority: -1\n\nPackage: flann*\nPin: version 1.19.*\nPin-Priority: -1" >> /etc/apt/preferences.d/isar' + +RUN apt-get update && apt-get install -y --no-install-recommends \ + eci-realtime-benchmarking \ + rt-tests \ + intel-cmt-cat \ + && rm -rf /var/lib/apt/lists/* + +ENV USER=root +RUN chmod +x /opt/benchmarking/caterpillar/caterpillar + +# Install uv package manager +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +WORKDIR /app + +# Copy project files +COPY pyproject.toml uv.lock* ./ +COPY src/ ./src/ +COPY conf/ ./conf/ +COPY main.py ./ + +# Install Python 3.12 and sync dependencies +RUN uv python install 3.12 && uv sync + +# Default: run main.py with docker=false so benchmarks execute natively +ENTRYPOINT ["uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false"] +# Override command to select benchmark, e.g.: +# docker run ... rt-tools-main:latest run.command=caterpillar +# docker run ... 
rt-tools-main:latest run.command=cyclictest diff --git a/conf/config.yaml b/conf/config.yaml index 48d7e0d..2bdeb0c 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -57,7 +57,7 @@ irq_affinity: run: command: "caterpillar" t_core: "9,11" - max_count: 10 + max_count: 1 numa_node: "1" stressor: true # tests_path: "tests" From cd28d419813bb7ae58f3c94da5b1b5eb62e60a65 Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Sun, 15 Feb 2026 11:38:47 +0100 Subject: [PATCH 07/15] Locally deployed container Signed-off-by: Pavel Abramov --- Dockerfile | 1 + conf/config.yaml | 14 +++++++------- src/test_runner.py | 5 ++++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 192dc54..9717448 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ eci-realtime-benchmarking \ rt-tests \ intel-cmt-cat \ + git \ && rm -rf /var/lib/apt/lists/* ENV USER=root diff --git a/conf/config.yaml b/conf/config.yaml index 2bdeb0c..db88527 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -4,7 +4,7 @@ sysinfo_collector_file: "${hydra:run.dir}/sysinfo.json" # --- Configuration for getting BIOS data from redfish bios: - enable: true + enable: false redfish: host: "0.0.0.0" username: "root" @@ -24,7 +24,7 @@ pqos: # 'msr' = Direct hardware access (Cores only, if some features are # not upstream kernel interface: "os" - enable: true + enable: false # resets all allocations before applying new ones reset_before_apply: true @@ -50,7 +50,7 @@ pqos: pids: [115, 118] irq_affinity: - enabled: true + enabled: false housekeeping_cores: "0-1" # --- Configuration related to tests running and parameters passed to it @@ -59,12 +59,12 @@ run: t_core: "9,11" max_count: 1 numa_node: "1" - stressor: true + stressor: false # tests_path: "tests" - metrics: true # enable metrics monitoring - docker: true # run inside docker container, if false will run on host system + metrics: false # enable 
metrics monitoring + docker: false # run inside docker container, if false will run on host system cat_clos_pinning: - enable: true # pins test PID to CLOS specified below works only for caterpillar and cyclictest + enable: false # pins test PID to CLOS specified below works only for caterpillar and cyclictest clos: 1 # --- High density End to End demo --- diff --git a/src/test_runner.py b/src/test_runner.py index f520604..9c53d19 100644 --- a/src/test_runner.py +++ b/src/test_runner.py @@ -264,7 +264,10 @@ def _run_caterpillar(self, base_cmd: List[str], t_core: str, path: str) -> int: process.terminate() finally: try: - subprocess.run("docker stop $(docker ps -q)", shell=True, check=False) + if self.config.run.docker: + subprocess.run( + "docker stop $(docker ps -q)", shell=True, check=False + ) except Exception as e: print(f"Error stopping containers: {e}") From cb9ce86feb8ee0ab3acdf08ca2c6a5d9b78ac9a6 Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Mon, 16 Feb 2026 19:54:18 +0100 Subject: [PATCH 08/15] Add detect CPUs Signed-off-by: Pavel Abramov --- main.py | 7 +++++- src/detect_cpus.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/detect_cpus.py diff --git a/main.py b/main.py index 874cf2b..161f18d 100644 --- a/main.py +++ b/main.py @@ -22,6 +22,7 @@ from src.irq_affinity import set_irq_affinity from src.test_runner import DockerTestRunner from src.hde2e import DockerHDE2E +from src.detect_cpus import detect_cpus def setup_pqos(cfg: DictConfig) -> None: @@ -136,7 +137,11 @@ def run_test(cfg: DictConfig): if cfg.irq_affinity.enabled: set_irq_affinity(cfg.irq_affinity.housekeeping_cores) - return runner.run_test(cfg.run.command, cfg.run.t_core, cfg.run.stressor) + cores = detect_cpus() + if cores == "": + cores = cfg.run.t_core + + return runner.run_test(cfg.run.command, cores, cfg.run.stressor) @hydra.main(version_base=None, config_path="conf", config_name="config") diff --git 
a/src/detect_cpus.py b/src/detect_cpus.py new file mode 100644 index 0000000..9f02812 --- /dev/null +++ b/src/detect_cpus.py @@ -0,0 +1,56 @@ +from pathlib import Path +import os + + +def detect_cpus() -> str: + cpus = ( + _from_cgroup_v2() + or _from_proc_stat() + or _from_proc_cpuinfo() + or _from_sysconf() + ) + return str(cpus) if cpus else "" + + +def _from_cgroup_v2() -> int | None: + p = Path("/sys/fs/cgroup/cpuset.cpus.effective") + if not p.is_file(): + return None + total = 0 + for part in p.read_text().strip().split(","): + if "-" in part: + lo, hi = part.split("-", 1) + total += int(hi) - int(lo) + 1 + else: + total += 1 + return total or None + + +def _from_proc_stat() -> int | None: + p = Path("/proc/stat") + if not p.is_file(): + return None + count = sum( + 1 + for line in p.read_text().splitlines() + if line.startswith("cpu") and line[3:4].isdigit() + ) + return count or None + + +def _from_proc_cpuinfo() -> int | None: + p = Path("/proc/cpuinfo") + if not p.is_file(): + return None + count = sum( + 1 for line in p.read_text().splitlines() if line.startswith("processor") + ) + return count or None + + +def _from_sysconf() -> int | None: + n = os.sysconf("SC_NPROCESSORS_ONLN") if hasattr(os, "sysconf") else 0 + if n > 0: + return n + n = os.sysconf("SC_NPROCESSORS_CONF") if hasattr(os, "sysconf") else 0 + return n if n > 0 else None From c928549ecbc0be452c62d198c0b4f36e9283fadb Mon Sep 17 00:00:00 2001 From: Pavel Abramov Date: Mon, 16 Feb 2026 19:55:26 +0100 Subject: [PATCH 09/15] No L2 in default config --- conf/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/config.yaml b/conf/config.yaml index db88527..da69bf4 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -34,7 +34,7 @@ pqos: description: "real-time workload" # Hexademical mask (e.g 0x003 means using the last 2 cache ways) l3_mask: "0x00ff" # comment if none - l2_mask: "0x00ff" + #l2_mask: "0x00ff" mba: 100 # set PIDs to assign to CLOS leave empty 
if none pids : [] @@ -44,7 +44,7 @@ pqos: description: "background worker" # Using different cache ways to isolate from class 1 l3_mask: "0x7f00" - l2_mask: "0xff00" + #l2_mask: "0xff00" mba: 10 cores: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] pids: [115, 118] From c6d2481e6be9504f8d6f3b4b16efd0709ebb3421 Mon Sep 17 00:00:00 2001 From: Mikhail Malyshev Date: Tue, 17 Feb 2026 04:57:16 +0100 Subject: [PATCH 10/15] standalone EVE OS containers with RT preflight, SSH, and Jupyter Restructure caterpillar and cyclictest into self-contained containers for deployment on EVE OS with full RT benchmarking support. Build & deploy: - Add build-all.sh with TAG= and optional registry push support - SSH_KEY build arg for key-only auth (defaults to ~/.ssh/ztest_key.pub) - BASE_TAG build arg to pin child images to versioned base - Print build summary with full FQDN image tags Base image (Dockerfile.base): - Add bash, git, procps, ncurses-term, openssh-server - Configure sshd with pubkey auth, start at container boot - Add login banner (motd) with Jupyter/SSH/CLI instructions - Add shell aliases: jupyter-start, rt-preflight, rt-info - Expose ports 22 (SSH) and 8888 (Jupyter) - Keep container alive after benchmark via sshd foreground RT preflight checks (src/rt_preflight.py): - 14-point validation: PREEMPT_RT, isolcpus, nohz_full, rcu_nocbs, irqaffinity, C-states, intel_pstate, governor, clocksource, NUMA balancing, split_lock, hugepages, capabilities, kernel threads - Detects cmdline typos (e.g. 
rocessor.max_cstate) - PASS/WARN/FAIL output visible in EVE OS cloud log viewer Container improvements: - Copy Python code, config, and notebooks into child images - Add run.interactive flag: tqdm in terminal, brief logs in containers - Fix detect_cpus to return actual core list from cgroup, not count - Fix cyclictest: remove redundant chrt (handled by -p 95) - Remove rdtset references, skip nested docker when run.docker=false - Fix typo in main.py import (scr -> src) Signed-off-by: Mikhail Malyshev --- Dockerfile.base | 183 ++++++++- build-all.sh | 101 +++++ caterpillar/Dockerfile | 29 +- conf/config.yaml | 11 +- cyclictest/Dockerfile | 31 +- main.py | 21 +- src/detect_cpus.py | 129 ++++--- src/rt_preflight.py | 815 +++++++++++++++++++++++++++++++++++++++++ src/test_runner.py | 75 ++-- 9 files changed, 1294 insertions(+), 101 deletions(-) create mode 100755 build-all.sh create mode 100644 src/rt_preflight.py diff --git a/Dockerfile.base b/Dockerfile.base index 8cb97e6..0a1fd23 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -1,20 +1,187 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ + bash \ + bash-completion \ ca-certificates \ curl \ + git \ gnupg \ + less \ lsb-release \ - wget + ncurses-base \ + ncurses-term \ + openssh-server \ + procps \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Shell environment +ENV SHELL=/bin/bash +ENV TERM=xterm-256color +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# Login banner (motd) +RUN cat > /etc/motd <<'MOTD' + + ╔═════════════════════════════════════════════════════════════╗ + ║ RT Benchmarking Container ║ + ╠═════════════════════════════════════════════════════════════╣ + ║ ║ + ║ BENCHMARKS ║ + ║ Run benchmark: cd /app && uv run python main.py \ ║ + ║ run.docker=false pqos.enable=false ║ + ║ Pre-flight: cd /app && uv run python -m \ ║ + ║ src.rt_preflight ║ + ║ ║ + ║ JUPYTER NOTEBOOK ║ + ║ Start: jupyter-start ║ + ║ Then open: http://:8888 ║ + ║ ║ + ║ RESULTS ║ + ║ Output 
dir: /app/outputs/ ║ + ║ Notebooks: /app/notebook/ ║ + ║ SCP results: scp -i root@:/app/outputs . ║ + ║ ║ + ║ USEFUL COMMANDS ║ + ║ rt-preflight Run RT environment checks ║ + ║ rt-info Show detected CPUs and kernel info ║ + ║ ll List files (long format) ║ + ║ ║ + ╚═════════════════════════════════════════════════════════════╝ + +MOTD + +# Minimal but usable bashrc +RUN cat > /root/.bashrc <<'EOF' +# prompt +PS1='\[\e[1;32m\]\u@\h\[\e[0m\]:\[\e[1;34m\]\w\[\e[0m\]\$ ' + +# history +HISTSIZE=1000 +HISTFILESIZE=2000 +HISTCONTROL=ignoreboth +shopt -s histappend + +# usability +shopt -s checkwinsize + +# color ls +alias ls='ls --color=auto' +alias ll='ls -alF' +alias la='ls -A' +alias l='ls -CF' + +# RT tools aliases +jupyter-start() { + local ip + ip=$(hostname -I 2>/dev/null | awk '{print $1}') + ip=${ip:-0.0.0.0} + echo "" + echo " Starting Jupyter Notebook..." + echo " Connect at: http://${ip}:8888" + echo "" + cd /app && uv run jupyter notebook \ + --ip=0.0.0.0 \ + --port=8888 \ + --no-browser \ + --allow-root \ + --notebook-dir=/app/notebook \ + --ServerApp.custom_display_url="http://${ip}:8888" +} +alias rt-preflight='cd /app && uv run python -m src.rt_preflight' +alias rt-info='echo "=== Kernel ===" && uname -a && echo && echo "=== CPUs (cgroup) ===" && cat /sys/fs/cgroup/cpuset.cpus.effective 2>/dev/null || cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null || echo "N/A" && echo && echo "=== RT cmdline params ===" && cat /proc/cmdline | tr " " "\n" | grep -E "isolcpus|nohz|rcu_nocbs|irqaffinity|cstate|pstate|hugepages"' + +# bash completion +if [ -f /usr/share/bash-completion/bash_completion ]; then + . 
/usr/share/bash-completion/bash_completion +fi + +# show motd on interactive login +if [ -f /etc/motd ] && [ -t 0 ]; then + cat /etc/motd +fi +EOF + +RUN cp /root/.bashrc /etc/skel/.bashrc + +# SSH server configuration +ARG SSH_KEY +RUN mkdir -p /run/sshd /root/.ssh && \ + chmod 700 /root/.ssh && \ + ssh-keygen -A && \ + sed -i 's/^#\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config && \ + sed -i 's/^#\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config && \ + sed -i 's/^#\?PubkeyAuthentication.*/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \ + sed -i 's/^#\?UsePAM.*/UsePAM no/' /etc/ssh/sshd_config && \ + echo "${SSH_KEY}" > /root/.ssh/authorized_keys && \ + chmod 600 /root/.ssh/authorized_keys + +EXPOSE 22 8888 + +# Entrypoint script: start sshd, run CMD, keep container alive +RUN cat > /entrypoint.sh <<'ENTRY' +#!/bin/bash +mkdir -p /run/sshd +echo "Starting SSH server..." +/usr/sbin/sshd -e || echo "WARNING: sshd failed to start (exit code $?)" + +IP=$(hostname -I 2>/dev/null | awk '{print $1}') +IP=${IP:-} + +echo "" +echo "============================================================" +echo " RT Benchmarking Container" +echo "============================================================" +echo "" +echo " SSH: ssh -i root@${IP}" +echo " Jupyter: ssh in, then run: jupyter-start" +echo " or from this console:" +echo " cd /app && uv run jupyter notebook --ip=0.0.0.0 \\" +echo " --port=8888 --no-browser --allow-root \\" +echo " --notebook-dir=/app/notebook" +echo " then open: http://${IP}:8888" +echo "" +echo " Results: /app/outputs/" +echo " Notebooks: /app/notebook/" +echo "============================================================" +echo "" + +if [ $# -gt 0 ]; then + echo "Running: $@" + "$@" + EXIT_CODE=$? 
+ echo "" + echo "============================================================" + echo " Command finished with exit code ${EXIT_CODE}" + echo "" + echo " Jupyter: http://${IP}:8888" + echo " To start: ssh root@${IP} then run: jupyter-start" + echo " SSH: ssh -i root@${IP}" + echo " Results: /app/outputs/" + echo "============================================================" + echo "" +fi + +echo "Container staying alive (sshd running). SSH in or ctrl-c to stop." +exec /usr/sbin/sshd -D +ENTRY +RUN chmod +x /entrypoint.sh + +# Intel and ECI repository keys and sources RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null RUN wget -O- https://eci.intel.com/repos/gpg-keys/GPG-PUB-KEY-INTEL-ECI.gpg | tee /usr/share/keyrings/eci-archive-keyring.gpg > /dev/null RUN wget -O- https://raw.githubusercontent.com/ros/rosdistro/master/ros.key | tee /usr/share/keyrings/ros-archive-keyring.gpg > /dev/null RUN . 
/etc/os-release \ -&& echo $VERSION_CODENAME && \ -bash -c 'echo "deb [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee /etc/apt/sources.list.d/eci.list' && \ -bash -c 'echo "deb-src [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee -a /etc/apt/sources.list.d/eci.list' && \ -bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list' && \ -bash -c 'echo -e "Package: intel-oneapi-runtime-*\nPin: version 2024.1.*\nPin-Priority: 1001" > /etc/apt/preferences.d/oneapi' && \ -bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/openvino/2024 ubuntu24 main" > /etc/apt/sources.list.d/intel-openvino-2024.list' && \ -bash -c 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] http://packages.ros.org/ros2/ubuntu $(source /etc/os-release && echo $VERSION_CODENAME) main" | tee /etc/apt/sources.list.d/ros2.list' + && echo $VERSION_CODENAME && \ + bash -c 'echo "deb [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee /etc/apt/sources.list.d/eci.list' && \ + bash -c 'echo "deb-src [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee -a /etc/apt/sources.list.d/eci.list' && \ + bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list' && \ + bash -c 'echo -e "Package: intel-oneapi-runtime-*\nPin: version 2024.1.*\nPin-Priority: 1001" > /etc/apt/preferences.d/oneapi' && \ + 
bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/openvino/2024 ubuntu24 main" > /etc/apt/sources.list.d/intel-openvino-2024.list' && \ + bash -c 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] http://packages.ros.org/ros2/ubuntu $(source /etc/os-release && echo $VERSION_CODENAME) main" | tee /etc/apt/sources.list.d/ros2.list' RUN apt-get update && apt-get install -y intel-cmt-cat + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["/bin/bash", "-l"] diff --git a/build-all.sh b/build-all.sh new file mode 100755 index 0000000..d0bc4cb --- /dev/null +++ b/build-all.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +set -euo pipefail + +REGISTRY="${1:-}" +TAG="${TAG:?TAG env var is required (e.g. TAG=1.0.7 $0 [registry])}" +SSH_KEY="${SSH_KEY:-$HOME/.ssh/ztest_key.pub}" + +if [ ! -f "$SSH_KEY" ]; then + echo "ERROR: SSH public key not found at ${SSH_KEY}" + echo "Set SSH_KEY to the path of your public key file" + exit 1 +fi + +SSH_KEY_CONTENT="$(cat "$SSH_KEY")" +echo "Using SSH public key: ${SSH_KEY}" + +IMAGES=( + "eci-base:Dockerfile.base:." + "caterpillar:caterpillar/Dockerfile:." + "cyclictest:cyclictest/Dockerfile:." 
+) + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Resolve FQDN prefix: always include registry host, even for local builds +if [ -n "$REGISTRY" ]; then + FQDN_PREFIX="${REGISTRY}" +else + FQDN_PREFIX="docker.io/library" +fi + +BUILT_TAGS=() + +for entry in "${IMAGES[@]}"; do + IFS=: read -r name dockerfile context <<< "$entry" + local_tag="${name}:${TAG}" + + echo "===> Building ${local_tag} from ${dockerfile} (context: ${context})" + docker build --network=host \ + --build-arg BASE_TAG="${TAG}" \ + --build-arg SSH_KEY="${SSH_KEY_CONTENT}" \ + -f "$dockerfile" \ + -t "$local_tag" \ + "$context" + + BUILT_TAGS+=("${FQDN_PREFIX}/${name}:${TAG}") + + if [ -n "$REGISTRY" ]; then + remote_tag="${REGISTRY}/${name}:${TAG}" + remote_latest="${REGISTRY}/${name}:latest" + + docker tag "$local_tag" "$remote_tag" + docker tag "$local_tag" "$remote_latest" + + echo "===> Pushing ${remote_tag}" + docker push "$remote_tag" + + echo "===> Pushing ${remote_latest}" + docker push "$remote_latest" + + BUILT_TAGS+=("${REGISTRY}/${name}:latest") + fi + + echo "" +done + +echo "" +echo "BUILD SUMMARY" +echo "" + +for entry in "${IMAGES[@]}"; do + IFS=: read -r name dockerfile context <<< "$entry" + local_tag="${name}:${TAG}" + size=$(docker image inspect "$local_tag" --format='{{.Size}}' 2>/dev/null || echo "0") + size_mb=$(( size / 1024 / 1024 )) + printf " %-50s %4s MB\n" "${FQDN_PREFIX}/${name}:${TAG}" "$size_mb" + if [ -n "$REGISTRY" ]; then + printf " %-50s %4s MB\n" "${REGISTRY}/${name}:latest" "$size_mb" + fi +done + +echo "" +echo "QUICK START" +echo " SSH: ssh -i root@" +echo " Jupyter: ssh in, then run: jupyter-start" +echo " Preflight: ssh in, then run: rt-preflight" +echo "" + +echo "ALL TAGS" +for t in "${BUILT_TAGS[@]}"; do + echo " $t" +done +echo "" + +if [ -n "$REGISTRY" ]; then + echo "Status: built and pushed to ${REGISTRY}" +else + echo "Status: built locally (no registry, push skipped)" +fi +echo "" diff --git 
a/caterpillar/Dockerfile b/caterpillar/Dockerfile index 4696add..67abd0f 100644 --- a/caterpillar/Dockerfile +++ b/caterpillar/Dockerfile @@ -13,17 +13,34 @@ # License. # -FROM eci-base:latest -# Install required dependencies +ARG BASE_TAG=latest +FROM eci-base:${BASE_TAG} + +# Install caterpillar benchmark RUN apt-get update && apt-get install -y --no-install-recommends \ - caterpillar \ - && rm -rf /var/lib/apt/lists/* + caterpillar \ + && rm -rf /var/lib/apt/lists/* ENV USER=root RUN chmod +x /opt/benchmarking/caterpillar/caterpillar -WORKDIR /opt/benchmarking/caterpillar +# Install uv package manager +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +WORKDIR /app + +# Copy project files +COPY pyproject.toml uv.lock* ./ +COPY src/ ./src/ +COPY conf/ ./conf/ +COPY notebook/ ./notebook/ +COPY main.py ./ + +# Install Python 3.12 and sync dependencies +RUN uv python install 3.12 && uv sync -# example directory to share file (-v output:/tmp/output) +# Output directory for results RUN mkdir -p /tmp/output +ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "run.command=caterpillar"] diff --git a/conf/config.yaml b/conf/config.yaml index da69bf4..fb26e58 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -17,7 +17,7 @@ bios: file: "${hydra:run.dir}/bios.json" pretty: true -# --- Intel CAT specific configuration --- +# --- Intel CAT specific configuration --- pqos: # # 'os' = Uses Linux resctrl (Required for PIDs) recommended @@ -25,7 +25,7 @@ pqos: # not upstream kernel interface: "os" enable: false - + # resets all allocations before applying new ones reset_before_apply: true @@ -37,7 +37,7 @@ pqos: #l2_mask: "0x00ff" mba: 100 # set PIDs to assign to CLOS leave empty if none - pids : [] + pids: [] # Option B: List of CPU cores assigned to this class, leave empty if none cores: [] - id: 0 @@ -46,13 +46,13 @@ pqos: l3_mask: 
"0x7f00" #l2_mask: "0xff00" mba: 10 - cores: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + cores: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] pids: [115, 118] irq_affinity: enabled: false housekeeping_cores: "0-1" - + # --- Configuration related to tests running and parameters passed to it run: command: "caterpillar" @@ -60,6 +60,7 @@ run: max_count: 1 numa_node: "1" stressor: false + interactive: true # show progress bars and verbose output (set false in containers) # tests_path: "tests" metrics: false # enable metrics monitoring docker: false # run inside docker container, if false will run on host system diff --git a/cyclictest/Dockerfile b/cyclictest/Dockerfile index e8ebd10..fdd74c3 100644 --- a/cyclictest/Dockerfile +++ b/cyclictest/Dockerfile @@ -13,12 +13,33 @@ # License. # -FROM eci-base:latest -# Install required dependencies +ARG BASE_TAG=latest +FROM eci-base:${BASE_TAG} + +# Install cyclictest benchmark RUN apt-get update && apt-get install -y --no-install-recommends \ - rt-tests\ - && rm -rf /var/lib/apt/lists/* + rt-tests \ + && rm -rf /var/lib/apt/lists/* + ENV USER=root -# example directory to share file (-v output:/tmp/output) + +# Install uv package manager +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +WORKDIR /app + +# Copy project files +COPY pyproject.toml uv.lock* ./ +COPY src/ ./src/ +COPY conf/ ./conf/ +COPY notebook/ ./notebook/ +COPY main.py ./ + +# Install Python 3.12 and sync dependencies +RUN uv python install 3.12 && uv sync + +# Output directory for results RUN mkdir -p /tmp/output +ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "run.command=cyclictest"] diff --git a/main.py b/main.py index 161f18d..ac850e1 100644 --- a/main.py +++ b/main.py @@ -1,28 +1,28 @@ import argparse import os -import sys -import hydra import subprocess +import sys import threading +import hydra from 
omegaconf import DictConfig, OmegaConf from src.bios_settings import process_bios_settings - +from src.detect_cpus import detect_cpus +from src.irq_affinity import set_irq_affinity from src.metrics import ( CPUmonitor, + CpuStatMonitor, InterruptMonitor, MemInfoMonitor, - SoftIrqMonitor, - CpuStatMonitor, PQOSMonitor, + SoftIrqMonitor, ) -from src.sysinfo_collector import SystemInfoCollector from src.pqos_manager import PQOSManager -from src.irq_affinity import set_irq_affinity +from src.rt_preflight import run_preflight +from src.sysinfo_collector import SystemInfoCollector from src.test_runner import DockerTestRunner from src.hde2e import DockerHDE2E -from scr.detect_cpus import detect_cpus def setup_pqos(cfg: DictConfig) -> None: @@ -92,6 +92,9 @@ def setup_metrics(cfg: DictConfig) -> None: def run_test(cfg: DictConfig): + # Validate RT environment before doing anything else + run_preflight(strict=not cfg.run.docker) + collector = SystemInfoCollector() collector.gather_all(cfg) collector.dump_to_file(cfg.sysinfo_collector_file) @@ -173,7 +176,7 @@ def main(cfg: DictConfig): [Service] Type=oneshot -User={os.getenv('USER')} +User={os.getenv("USER")} WorkingDirectory={execution_dir} ExecStart=sudo ./env/python3 main.py RemainAfterExit=no diff --git a/src/detect_cpus.py b/src/detect_cpus.py index 9f02812..cb4a2d9 100644 --- a/src/detect_cpus.py +++ b/src/detect_cpus.py @@ -1,56 +1,101 @@ -from pathlib import Path +import logging import os +from pathlib import Path + +log = logging.getLogger(__name__) + +_CGROUP_V2_PATHS = (Path("/sys/fs/cgroup/cpuset.cpus.effective"),) + +_CGROUP_V1_PATHS = ( + Path("/sys/fs/cgroup/cpuset/cpuset.cpus"), + Path("/sys/fs/cgroup/cpuset/cpuset.effective_cpus"), +) def detect_cpus() -> str: - cpus = ( - _from_cgroup_v2() - or _from_proc_stat() - or _from_proc_cpuinfo() - or _from_sysconf() - ) - return str(cpus) if cpus else "" + """Detect which CPUs this process/container is allowed to run on. + Returns the cpuset string (e.g. 
"9,11" or "2-5") as seen by cgroup, + falling back to /proc/stat or sysconf if cgroup is unavailable. + """ + for source, result in ( + ("cgroup-v2", _from_cgroup_v2), + ("cgroup-v1", _from_cgroup_v1), + ("/proc/stat", _from_proc_stat), + ("sysconf", _from_sysconf), + ): + cpus = result() + if cpus: + log.info("Detected CPUs from %s: %s", source, cpus) + return cpus + log.debug("CPU detection via %s: not available", source) -def _from_cgroup_v2() -> int | None: - p = Path("/sys/fs/cgroup/cpuset.cpus.effective") - if not p.is_file(): - return None - total = 0 - for part in p.read_text().strip().split(","): - if "-" in part: - lo, hi = part.split("-", 1) - total += int(hi) - int(lo) + 1 - else: - total += 1 - return total or None + log.warning("Could not detect CPUs from any source") + return "" + + +def _from_cgroup_v2() -> str | None: + """Read allowed CPUs from cgroup v2 cpuset controller.""" + for p in _CGROUP_V2_PATHS: + if p.is_file(): + content = p.read_text().strip() + if content: + return content + return None -def _from_proc_stat() -> int | None: +def _from_cgroup_v1() -> str | None: + """Read allowed CPUs from cgroup v1 cpuset controller.""" + for p in _CGROUP_V1_PATHS: + if p.is_file(): + content = p.read_text().strip() + if content: + return content + return None + + +def _from_proc_stat() -> str | None: + """Parse /proc/stat to find online CPU numbers and return as list.""" p = Path("/proc/stat") if not p.is_file(): return None - count = sum( - 1 - for line in p.read_text().splitlines() - if line.startswith("cpu") and line[3:4].isdigit() - ) - return count or None + cpus = [] + for line in p.read_text().splitlines(): + if line.startswith("cpu") and line[3:4].isdigit(): + # line looks like "cpu0 ..." 
— extract the number + tag = line.split()[0] + cpus.append(int(tag[3:])) + if not cpus: + return None + return _compact(sorted(cpus)) -def _from_proc_cpuinfo() -> int | None: - p = Path("/proc/cpuinfo") - if not p.is_file(): +def _from_sysconf() -> str | None: + """Fall back to os.sysconf for online CPU count, return 0..N-1 range.""" + if not hasattr(os, "sysconf"): + return None + n = os.sysconf("SC_NPROCESSORS_ONLN") + if n <= 0: + n = os.sysconf("SC_NPROCESSORS_CONF") + if n <= 0: return None - count = sum( - 1 for line in p.read_text().splitlines() if line.startswith("processor") - ) - return count or None - - -def _from_sysconf() -> int | None: - n = os.sysconf("SC_NPROCESSORS_ONLN") if hasattr(os, "sysconf") else 0 - if n > 0: - return n - n = os.sysconf("SC_NPROCESSORS_CONF") if hasattr(os, "sysconf") else 0 - return n if n > 0 else None + return f"0-{n - 1}" if n > 1 else "0" + + +def _compact(cpus: list[int]) -> str: + """Turn a sorted list of ints into a compact range string. + + e.g. [0, 1, 2, 5, 7, 8, 9] -> "0-2,5,7-9" + """ + if not cpus: + return "" + ranges: list[str] = [] + start = prev = cpus[0] + for c in cpus[1:]: + if c == prev + 1: + prev = c + else: + ranges.append(f"{start}-{prev}" if start != prev else str(start)) + start = prev = c + ranges.append(f"{start}-{prev}" if start != prev else str(start)) + return ",".join(ranges) diff --git a/src/rt_preflight.py b/src/rt_preflight.py new file mode 100644 index 0000000..ef8c074 --- /dev/null +++ b/src/rt_preflight.py @@ -0,0 +1,815 @@ +"""RT pre-flight validation. + +Runs at container startup to verify the host kernel and cgroup +environment are properly configured for real-time benchmarks. 
+ +Usage: + from src.rt_preflight import run_preflight + run_preflight() # logs results, raises on FAIL + run_preflight(strict=False) # logs results, never raises +""" + +import logging +import os +import re +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Optional + +log = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Result types +# --------------------------------------------------------------------------- + + +class Status(Enum): + PASS = "PASS" + WARN = "WARN" + FAIL = "FAIL" + SKIP = "SKIP" + + +@dataclass +class CheckResult: + name: str + status: Status + message: str + detail: str = "" + + +@dataclass +class PreflightReport: + results: list[CheckResult] = field(default_factory=list) + + def add(self, result: CheckResult) -> None: + self.results.append(result) + + @property + def passed(self) -> bool: + return all(r.status != Status.FAIL for r in self.results) + + def summary(self) -> str: + lines = [ + "", + "=" * 64, + " RT PRE-FLIGHT CHECK", + "=" * 64, + ] + for r in self.results: + icon = { + Status.PASS: "\u2705", + Status.WARN: "\u26a0\ufe0f ", + Status.FAIL: "\u274c", + Status.SKIP: "\u23ed\ufe0f ", + }.get(r.status, "?") + lines.append(f" {icon} [{r.status.value:4s}] {r.name}: {r.message}") + if r.detail: + for dl in r.detail.strip().splitlines(): + lines.append(f" {dl}") + lines.append("=" * 64) + n_pass = sum(1 for r in self.results if r.status == Status.PASS) + n_warn = sum(1 for r in self.results if r.status == Status.WARN) + n_fail = sum(1 for r in self.results if r.status == Status.FAIL) + n_skip = sum(1 for r in self.results if r.status == Status.SKIP) + lines.append( + f" Total: {len(self.results)} " + f"Pass: {n_pass} Warn: {n_warn} Fail: {n_fail} Skip: {n_skip}" + ) + verdict = "READY" if self.passed else "NOT READY" + lines.append(f" Verdict: {verdict}") + lines.append("=" * 64) + lines.append("") + return 
"\n".join(lines) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _read(path: str | Path) -> Optional[str]: + """Read a file, return None if missing / unreadable.""" + try: + return Path(path).read_text().strip() + except (OSError, PermissionError): + return None + + +def _cmdline() -> str: + return _read("/proc/cmdline") or "" + + +def _cmdline_param(name: str) -> Optional[str]: + """Extract value of name= from /proc/cmdline, or None.""" + m = re.search(rf"(?:^|\s){re.escape(name)}=(\S+)", _cmdline()) + return m.group(1) if m else None + + +def _cmdline_has(name: str) -> bool: + """Check if a param (with or without value) exists in cmdline.""" + return bool(re.search(rf"(?:^|\s){re.escape(name)}(?:=|\s|$)", _cmdline())) + + +def _parse_cpulist(s: str) -> set[int]: + """Parse '1-3,5,7-9' into {1,2,3,5,7,8,9}.""" + cpus: set[int] = set() + for part in s.split(","): + part = part.strip() + if not part: + continue + if "-" in part: + lo, hi = part.split("-", 1) + cpus.update(range(int(lo), int(hi) + 1)) + else: + cpus.add(int(part)) + return cpus + + +def _get_container_cpus() -> Optional[str]: + """Read the cpuset that this container is allowed to use.""" + for p in ( + Path("/sys/fs/cgroup/cpuset.cpus.effective"), + Path("/sys/fs/cgroup/cpuset/cpuset.cpus"), + Path("/sys/fs/cgroup/cpuset/cpuset.effective_cpus"), + ): + content = _read(p) + if content: + return content + return None + + +# --------------------------------------------------------------------------- +# Individual checks +# --------------------------------------------------------------------------- + + +def check_preempt_rt() -> CheckResult: + """Verify kernel is PREEMPT_RT.""" + name = "PREEMPT_RT kernel" + + # Method 1: /sys/kernel/realtime + rt_flag = _read("/sys/kernel/realtime") + if rt_flag == "1": + uname = _read("/proc/version") or "" + return CheckResult( + name, 
+ Status.PASS, + "RT kernel confirmed", + detail=uname.split()[0:3].__repr__() if uname else "", + ) + + # Method 2: uname string + version = _read("/proc/version") or "" + if "PREEMPT_RT" in version: + return CheckResult( + name, Status.PASS, "RT kernel (from /proc/version)", detail=version[:120] + ) + + if "PREEMPT" in version: + return CheckResult( + name, Status.WARN, "PREEMPT kernel but not PREEMPT_RT", detail=version[:120] + ) + + return CheckResult( + name, + Status.FAIL, + "Kernel does not appear to be PREEMPT_RT", + detail=version[:120], + ) + + +def check_isolcpus() -> CheckResult: + """Check that isolcpus is set and our container cpus are in the isolated set.""" + name = "CPU isolation (isolcpus)" + + isolated_param = _cmdline_param("isolcpus") + if not isolated_param: + # Also check sysfs + isolated_sysfs = _read("/sys/devices/system/cpu/isolated") + if isolated_sysfs: + isolated_param = isolated_sysfs + else: + return CheckResult( + name, Status.WARN, "isolcpus not found in cmdline or sysfs" + ) + + isolated_set = _parse_cpulist(isolated_param) + + container_cpus_str = _get_container_cpus() + if not container_cpus_str: + return CheckResult( + name, + Status.PASS, + f"isolcpus={isolated_param} (could not read container cpuset)", + detail="Cannot verify overlap — cgroup cpuset not readable", + ) + + container_set = _parse_cpulist(container_cpus_str) + not_isolated = container_set - isolated_set + if not_isolated: + return CheckResult( + name, + Status.WARN, + f"Container CPUs {sorted(not_isolated)} are NOT in isolcpus={isolated_param}", + detail=f"Container cpuset: {container_cpus_str}\n" + f"Isolated: {isolated_param}", + ) + + return CheckResult( + name, + Status.PASS, + f"All container CPUs ({container_cpus_str}) are isolated", + detail=f"isolcpus={isolated_param}", + ) + + +def check_nohz_full() -> CheckResult: + """Check nohz_full covers our container cores.""" + name = "Tickless (nohz_full)" + + nohz = _cmdline_param("nohz_full") + if not nohz: + 
return CheckResult( + name, Status.WARN, "nohz_full not set — timer tick will interrupt RT cores" + ) + + nohz_set = _parse_cpulist(nohz) + container_cpus_str = _get_container_cpus() + if not container_cpus_str: + return CheckResult( + name, Status.PASS, f"nohz_full={nohz} (cannot verify container overlap)" + ) + + container_set = _parse_cpulist(container_cpus_str) + missing = container_set - nohz_set + if missing: + return CheckResult( + name, + Status.WARN, + f"Container CPUs {sorted(missing)} not in nohz_full={nohz}", + detail="These cores will still receive timer ticks", + ) + + return CheckResult( + name, + Status.PASS, + f"All container CPUs ({container_cpus_str}) are tickless", + detail=f"nohz_full={nohz}", + ) + + +def check_rcu_nocbs() -> CheckResult: + """Check rcu_nocbs covers our container cores.""" + name = "RCU offloading (rcu_nocbs)" + + rcu = _cmdline_param("rcu_nocbs") + if not rcu: + return CheckResult( + name, Status.WARN, "rcu_nocbs not set — RCU callbacks may run on RT cores" + ) + + rcu_set = _parse_cpulist(rcu) + container_cpus_str = _get_container_cpus() + if not container_cpus_str: + return CheckResult( + name, Status.PASS, f"rcu_nocbs={rcu} (cannot verify container overlap)" + ) + + container_set = _parse_cpulist(container_cpus_str) + missing = container_set - rcu_set + if missing: + return CheckResult( + name, + Status.WARN, + f"Container CPUs {sorted(missing)} not in rcu_nocbs={rcu}", + ) + + return CheckResult( + name, + Status.PASS, + f"All container CPUs ({container_cpus_str}) have RCU callbacks offloaded", + detail=f"rcu_nocbs={rcu}", + ) + + +def check_irqaffinity() -> CheckResult: + """Check irqaffinity is set to housekeeping cores only.""" + name = "IRQ affinity (irqaffinity)" + + affinity = _cmdline_param("irqaffinity") + if not affinity: + return CheckResult( + name, + Status.WARN, + "irqaffinity not set in cmdline — IRQs may land on RT cores", + ) + + affinity_set = _parse_cpulist(affinity) + container_cpus_str = 
_get_container_cpus() + if container_cpus_str: + container_set = _parse_cpulist(container_cpus_str) + overlap = container_set & affinity_set + if overlap: + return CheckResult( + name, + Status.WARN, + f"irqaffinity={affinity} overlaps container CPUs {sorted(overlap)}", + detail="IRQs may be routed to RT cores", + ) + + return CheckResult( + name, Status.PASS, f"IRQs pinned to housekeeping cores ({affinity})" + ) + + +def check_cstates() -> CheckResult: + """Verify C-states are disabled.""" + name = "C-states disabled" + + cstate_params = [ + "intel.max_cstate", + "intel_idle.max_cstate", + "processor.max_cstate", + "processor_idle.max_cstate", + ] + + cmdline = _cmdline() + found = {} + for param in cstate_params: + val = _cmdline_param(param) + if val is not None: + found[param] = val + + if not found: + return CheckResult( + name, + Status.WARN, + "No max_cstate parameters found in cmdline", + detail="C-states may cause latency spikes", + ) + + non_zero = {k: v for k, v in found.items() if v != "0"} + if non_zero: + return CheckResult( + name, + Status.WARN, + f"Some max_cstate params are not 0: {non_zero}", + detail="\n".join(f" {k}={v}" for k, v in found.items()), + ) + + # Also check for the known typo + typo_params = ["rocessor.max_cstate", "rocessor_idle.max_cstate"] + typos_found = [] + for tp in typo_params: + if tp in cmdline: + typos_found.append(tp) + + detail = "\n".join(f" {k}={v}" for k, v in found.items()) + if typos_found: + detail += f"\n WARNING: Possible typo in cmdline: {', '.join(typos_found)}" + detail += "\n (missing leading 'p' — parameter is being ignored by kernel)" + return CheckResult( + name, + Status.WARN, + "C-states look disabled but found typo in cmdline", + detail=detail, + ) + + return CheckResult( + name, Status.PASS, "All C-states disabled via kernel cmdline", detail=detail + ) + + +def check_intel_pstate() -> CheckResult: + """Check intel_pstate is disabled (allows direct freq control).""" + name = "Intel P-state driver" + + 
val = _cmdline_param("intel_pstate") + if val == "disable": + return CheckResult(name, Status.PASS, "intel_pstate=disable (good for RT)") + + if val: + return CheckResult( + name, + Status.WARN, + f"intel_pstate={val} — consider intel_pstate=disable for RT", + ) + + # Check if the driver is active + pstate_dir = Path("/sys/devices/system/cpu/intel_pstate") + if pstate_dir.is_dir(): + return CheckResult( + name, Status.WARN, "intel_pstate driver is active — frequency may fluctuate" + ) + + return CheckResult(name, Status.PASS, "intel_pstate not active") + + +def check_cpu_governor() -> CheckResult: + """Check CPU frequency governor on container cores.""" + name = "CPU frequency governor" + + container_cpus_str = _get_container_cpus() + if not container_cpus_str: + return CheckResult(name, Status.SKIP, "Cannot determine container CPUs") + + governors: dict[str, list[int]] = {} + for cpu in sorted(_parse_cpulist(container_cpus_str)): + gov = _read(f"/sys/devices/system/cpu/cpu{cpu}/cpufreq/scaling_governor") + if gov: + governors.setdefault(gov, []).append(cpu) + + if not governors: + # intel_pstate=disable + no cpufreq driver = no governor file + pstate_val = _cmdline_param("intel_pstate") + if pstate_val == "disable": + return CheckResult( + name, + Status.PASS, + "No cpufreq governor (intel_pstate disabled, frequency is fixed)", + ) + return CheckResult( + name, Status.SKIP, "Could not read governor for any container CPU" + ) + + if set(governors.keys()) == {"performance"}: + return CheckResult( + name, Status.PASS, "All container CPUs on 'performance' governor" + ) + + detail_lines = [] + for gov, cpus in sorted(governors.items()): + detail_lines.append(f" {gov}: CPUs {cpus}") + non_perf = {g for g in governors if g != "performance"} + return CheckResult( + name, + Status.WARN, + f"Non-performance governors found: {non_perf}", + detail="\n".join(detail_lines), + ) + + +def check_clocksource() -> CheckResult: + """Verify clocksource is TSC.""" + name = 
"Clocksource" + + current = _read("/sys/devices/system/clocksource/clocksource0/current_clocksource") + if not current: + return CheckResult(name, Status.SKIP, "Cannot read current clocksource") + + available = ( + _read("/sys/devices/system/clocksource/clocksource0/available_clocksource") + or "" + ) + + if current == "tsc": + return CheckResult( + name, Status.PASS, "Clocksource is TSC", detail=f"Available: {available}" + ) + + return CheckResult( + name, + Status.WARN, + f"Clocksource is '{current}' (TSC preferred for RT)", + detail=f"Available: {available}", + ) + + +def check_numa_balancing() -> CheckResult: + """Check NUMA balancing is disabled.""" + name = "NUMA balancing" + + val = _cmdline_param("numa_balancing") + if val == "disable": + return CheckResult(name, Status.PASS, "NUMA balancing disabled via cmdline") + + sysctl = _read("/proc/sys/kernel/numa_balancing") + if sysctl == "0": + return CheckResult(name, Status.PASS, "NUMA balancing disabled (sysctl)") + + if sysctl == "1": + return CheckResult( + name, + Status.WARN, + "NUMA balancing is enabled — may cause latency jitter", + detail="Set numa_balancing=disable on kernel cmdline", + ) + + return CheckResult(name, Status.SKIP, "Cannot determine NUMA balancing state") + + +def check_split_lock() -> CheckResult: + """Check split_lock_detect is off.""" + name = "Split-lock detection" + + val = _cmdline_param("split_lock_detect") + if val == "off": + return CheckResult(name, Status.PASS, "split_lock_detect=off") + + if val: + return CheckResult( + name, + Status.WARN, + f"split_lock_detect={val} — may cause unexpected #AC exceptions", + ) + + return CheckResult(name, Status.WARN, "split_lock_detect not set in cmdline") + + +def check_hugepages() -> CheckResult: + """Check hugepages are allocated.""" + name = "Hugepages" + + cmdline_val = _cmdline_param("hugepages") + nr = _read("/proc/sys/vm/nr_hugepages") + free = _read("/proc/meminfo") + + hp_info = "" + if free: + for line in free.splitlines(): + if 
"Huge" in line: + hp_info += f" {line.strip()}\n" + + if cmdline_val and int(cmdline_val) > 0: + return CheckResult( + name, + Status.PASS, + f"hugepages={cmdline_val} configured", + detail=hp_info.rstrip(), + ) + + if nr and int(nr) > 0: + return CheckResult( + name, Status.PASS, f"{nr} hugepages available", detail=hp_info.rstrip() + ) + + return CheckResult(name, Status.WARN, "No hugepages configured") + + +def check_container_cpuset() -> CheckResult: + """Report which CPUs this container is pinned to.""" + name = "Container cpuset" + + cpus = _get_container_cpus() + if not cpus: + return CheckResult( + name, + Status.FAIL, + "Cannot detect container CPU assignment", + detail="Checked cgroup v1 and v2 cpuset paths", + ) + + cpu_set = _parse_cpulist(cpus) + return CheckResult( + name, Status.PASS, f"Pinned to CPUs: {cpus} ({len(cpu_set)} cores)" + ) + + +def check_kernel_thread_priorities() -> CheckResult: + """Scan for kernel threads running at priority >= 90 that could interfere.""" + name = "Kernel thread priorities" + + container_cpus_str = _get_container_cpus() + container_set = _parse_cpulist(container_cpus_str) if container_cpus_str else None + + high_prio: list[str] = [] + proc = Path("/proc") + for pid_dir in proc.iterdir(): + if not pid_dir.name.isdigit(): + continue + try: + comm = (pid_dir / "comm").read_text().strip() + sched_lines = (pid_dir / "sched").read_text().splitlines() + # Check if it's an RT task by reading /proc//stat + stat = (pid_dir / "stat").read_text() + # Field 41 is the policy (1=FIFO, 2=RR) and field 18 is priority + fields = stat.rsplit(")", 1)[-1].split() + # fields[0] = state, fields[15] = priority, fields[38] = rt_priority, fields[39] = policy + if len(fields) > 39: + rt_prio = int( + fields[37] + ) # rt_priority (0-indexed from field after ')') + policy = int(fields[38]) + if policy in (1, 2) and rt_prio >= 90: + # Check CPU affinity if possible + try: + affinity = os.sched_getaffinity(int(pid_dir.name)) + if container_set and 
affinity & container_set: + high_prio.append( + f"PID {pid_dir.name} ({comm}): " + f"policy={'FIFO' if policy == 1 else 'RR'} " + f"prio={rt_prio} cpus={sorted(affinity & container_set)}" + ) + except (OSError, PermissionError): + pass + except (OSError, PermissionError, ValueError, IndexError): + continue + + if not high_prio: + return CheckResult( + name, Status.PASS, "No competing high-priority RT threads on container CPUs" + ) + + return CheckResult( + name, + Status.WARN, + f"{len(high_prio)} kernel thread(s) at RT priority >= 90 on container CPUs", + detail="\n".join(high_prio), + ) + + +def check_capabilities() -> CheckResult: + """Report container Linux capabilities and verify RT-required ones.""" + name = "Container capabilities" + + # All known Linux capabilities (as of kernel 6.x) + _CAP_NAMES = { + 0: "CAP_CHOWN", + 1: "CAP_DAC_OVERRIDE", + 2: "CAP_DAC_READ_SEARCH", + 3: "CAP_FOWNER", + 4: "CAP_FSETID", + 5: "CAP_KILL", + 6: "CAP_SETGID", + 7: "CAP_SETUID", + 8: "CAP_SETPCAP", + 9: "CAP_LINUX_IMMUTABLE", + 10: "CAP_NET_BIND_SERVICE", + 11: "CAP_NET_BROADCAST", + 12: "CAP_NET_ADMIN", + 13: "CAP_NET_RAW", + 14: "CAP_IPC_LOCK", + 15: "CAP_IPC_OWNER", + 16: "CAP_SYS_MODULE", + 17: "CAP_SYS_RAWIO", + 18: "CAP_SYS_CHROOT", + 19: "CAP_SYS_PTRACE", + 20: "CAP_SYS_PACCT", + 21: "CAP_SYS_ADMIN", + 22: "CAP_SYS_BOOT", + 23: "CAP_SYS_NICE", + 24: "CAP_SYS_RESOURCE", + 25: "CAP_SYS_TIME", + 26: "CAP_SYS_TTY_CONFIG", + 27: "CAP_MKNOD", + 28: "CAP_LEASE", + 29: "CAP_AUDIT_WRITE", + 30: "CAP_AUDIT_CONTROL", + 31: "CAP_SETFCAP", + 32: "CAP_MAC_OVERRIDE", + 33: "CAP_MAC_ADMIN", + 34: "CAP_SYSLOG", + 35: "CAP_WAKE_ALARM", + 36: "CAP_BLOCK_SUSPEND", + 37: "CAP_AUDIT_READ", + 38: "CAP_PERFMON", + 39: "CAP_BPF", + 40: "CAP_CHECKPOINT_RESTORE", + } + + # Capabilities required for RT benchmarks + _RT_REQUIRED = { + 23: "CAP_SYS_NICE", # chrt, sched_setscheduler, RT priorities + 14: "CAP_IPC_LOCK", # mlockall, locking memory pages + 21: "CAP_SYS_ADMIN", # cgroup, /dev access, 
various RT ops + } + + # Nice to have for RT + _RT_OPTIONAL = { + 17: "CAP_SYS_RAWIO", # MSR access (SMI counters, etc.) + 24: "CAP_SYS_RESOURCE", # override RLIMIT_RTPRIO + 12: "CAP_NET_ADMIN", # network tuning, IRQ affinity + } + + status_text = _read("/proc/self/status") + if not status_text: + return CheckResult(name, Status.SKIP, "Cannot read /proc/self/status") + + # Parse CapEff (effective), CapPrm (permitted), CapBnd (bounding) + caps: dict[str, int] = {} + for line in status_text.splitlines(): + for field in ("CapEff", "CapPrm", "CapBnd"): + if line.startswith(f"{field}:"): + caps[field] = int(line.split(":")[1].strip(), 16) + + cap_eff = caps.get("CapEff", 0) + + def _decode(bitmask: int) -> list[str]: + found = [] + for bit in range(41): + if bitmask & (1 << bit): + found.append(_CAP_NAMES.get(bit, f"CAP_{bit}")) + return found + + effective = _decode(cap_eff) + + # Check required caps + missing_required = [] + for bit, cap_name in _RT_REQUIRED.items(): + if not (cap_eff & (1 << bit)): + missing_required.append(cap_name) + + missing_optional = [] + for bit, cap_name in _RT_OPTIONAL.items(): + if not (cap_eff & (1 << bit)): + missing_optional.append(cap_name) + + # Build detail: show all effective caps + detail_lines = [f"Effective ({len(effective)}): {', '.join(effective)}"] + if missing_optional: + detail_lines.append(f"Optional missing: {', '.join(missing_optional)}") + + detail = "\n".join(detail_lines) + + if missing_required: + return CheckResult( + name, + Status.FAIL, + f"Missing required RT capabilities: {', '.join(missing_required)}", + detail=detail, + ) + + if missing_optional: + return CheckResult( + name, + Status.WARN, + f"All required caps present, optional missing: {', '.join(missing_optional)}", + detail=detail, + ) + + return CheckResult( + name, + Status.PASS, + f"All RT capabilities present ({len(effective)} total effective)", + detail=detail, + ) + + +# --------------------------------------------------------------------------- +# 
Public API +# --------------------------------------------------------------------------- + +ALL_CHECKS = [ + check_container_cpuset, + check_capabilities, + check_preempt_rt, + check_isolcpus, + check_nohz_full, + check_rcu_nocbs, + check_irqaffinity, + check_cstates, + check_intel_pstate, + check_cpu_governor, + check_clocksource, + check_numa_balancing, + check_split_lock, + check_hugepages, + check_kernel_thread_priorities, +] + + +def run_preflight(strict: bool = True) -> PreflightReport: + """Run all pre-flight checks. + + Args: + strict: If True, raise RuntimeError when any check FAILs. + + Returns: + PreflightReport with all results. + """ + report = PreflightReport() + + for check_fn in ALL_CHECKS: + try: + result = check_fn() + except Exception as exc: + result = CheckResult( + name=check_fn.__doc__ or check_fn.__name__, + status=Status.SKIP, + message=f"Check raised exception: {exc}", + ) + report.add(result) + + summary = report.summary() + # Always print to stdout so it's visible in container logs + print(summary) + # Also log it + log.info(summary) + + if strict and not report.passed: + raise RuntimeError( + "RT pre-flight checks failed. " + "Fix the issues above or pass strict=False to skip." 
+ ) + + return report + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.DEBUG, format="%(levelname)s %(name)s - %(message)s" + ) + run_preflight(strict=False) diff --git a/src/test_runner.py b/src/test_runner.py index 9c53d19..8687a4d 100644 --- a/src/test_runner.py +++ b/src/test_runner.py @@ -1,21 +1,24 @@ import csv import io +import logging +import shlex import subprocess -import psutil import time -import shlex - +from pathlib import Path +from subprocess import CompletedProcess, Popen from typing import List, Optional + +import psutil from omegaconf import DictConfig, OmegaConf -from pathlib import Path from tqdm import tqdm -from subprocess import CompletedProcess, Popen -from src.pqos_manager import PQOSManager +log = logging.getLogger(__name__) + +from src.pqos_manager import PQOSManager from src.test_output_parser import ( + MegabenchParser, build_caterpillar_parser, build_cyclictest_parser, - MegabenchParser, ) @@ -174,7 +177,11 @@ def run_test(self, test: str, t_core: str, stressor: bool = False) -> int: print(f" Target Core(s): {t_core}") print(f" Stressor: {stressor}") - docker_cmd = self._build_base_docker_command(test, t_core) + docker_cmd = ( + self._build_base_docker_command(test, t_core) + if self.config.run.docker + else [] + ) if test == "caterpillar": return self._run_caterpillar( @@ -230,23 +237,25 @@ def _run_caterpillar(self, base_cmd: List[str], t_core: str, path: str) -> int: f"-c {t_core} -s {self.config.caterpillar.n_cycles}" ) if self.config.run.docker: - rdtset_cmd = f"stdbuf -oL -eL " f"{caterpillar_cmd}" cmd = base_cmd + [ "caterpillar:latest", "/bin/bash", "-c", - rdtset_cmd, + f"stdbuf -oL -eL {caterpillar_cmd}", ] else: cmd = shlex.split(caterpillar_cmd) - print(" ".join(cmd)) + log.info("Running: %s", " ".join(cmd)) + interactive = self.config.run.get("interactive", True) try: process = self._run_interactive_command(cmd) assert process.stdout is not None - pbar = tqdm(total=self.config.caterpillar.n_cycles) 
+ n_cycles = self.config.caterpillar.n_cycles + count = 0 parser = build_caterpillar_parser() + pbar = tqdm(total=n_cycles, desc="caterpillar") if interactive else None with open(path, "w") as f: prelude = parser.prelude() @@ -256,9 +265,16 @@ def _run_caterpillar(self, base_cmd: List[str], t_core: str, path: str) -> int: for line in process.stdout: parsed = parser.parse(line) if parsed is not None: - pbar.update(1) + count += 1 f.write(parsed) - pbar.close() + if pbar: + pbar.update(1) + elif count % 1000 == 0 or count == n_cycles: + print(f"caterpillar: {count}/{n_cycles} cycles") + + if pbar: + pbar.close() + print(f"caterpillar: completed {count} cycles, results in {path}") except KeyboardInterrupt: process.terminate() @@ -278,45 +294,46 @@ def _run_cyclictest( ) -> int: """Run cyclictest.""" cyclictest_cmd = ( - # "chrt -r 95 " f"/usr/bin/cyclictest --threads -t 1 -p 95 " f"-l {cycles} -d 1 -D 0 -i {self.config.caterpillar.n_cycles} -a {t_core}" ) if self.config.run.docker: - rdtset_cmd = f"stdbuf -oL -eL " f"{cyclictest_cmd}" cmd = base_cmd + [ "cyclictest:latest", "/bin/bash", "-c", - rdtset_cmd, + f"stdbuf -oL -eL {cyclictest_cmd}", ] else: cmd = shlex.split(cyclictest_cmd) - print(" ".join(cmd)) + log.info("Running: %s", " ".join(cmd)) + interactive = self.config.run.get("interactive", True) process = self._run_interactive_command(cmd) assert process.stdout is not None - pbar = tqdm(total=400000) last_c_value = 0 + total_lines = 0 parser = build_cyclictest_parser() + pbar = tqdm(total=int(cycles), desc="cyclictest") if interactive else None with open(path, "w") as f: prelude = parser.prelude() if prelude is not None: f.write(prelude) for line in process.stdout: - print(line) + if interactive: + print(line, end="") parsed = parser.parse(line) if parsed is None: continue + total_lines += 1 f.write(parsed) try: - # Use csv reader to safely split by commas reader = csv.DictReader( io.StringIO(parsed), fieldnames=parser.headers, @@ -324,16 +341,22 @@ def 
_run_cyclictest( row = next(reader) c_val = int(row["C"]) - # --- Update tqdm only if C increases --- if c_val > last_c_value: - pbar.update(c_val - last_c_value) + delta = c_val - last_c_value last_c_value = c_val + if pbar: + pbar.update(delta) + elif c_val % 10000 == 0: + print(f"cyclictest: {c_val}/{cycles} loops") except Exception as e: - # Optional: log or ignore malformed lines - print(f"Warning: could not parse C value: {e}") + log.debug("Could not parse C value: %s", e) - pbar.close() + if pbar: + pbar.close() + print( + f"cyclictest: completed {last_c_value} loops ({total_lines} samples), results in {path}" + ) return process.wait() From 5783ddffb98c69ccea0f423ad18294e1a14b61c7 Mon Sep 17 00:00:00 2001 From: Mikhail Malyshev Date: Tue, 17 Feb 2026 05:21:08 +0100 Subject: [PATCH 11/15] pin service threads to housekeeping core, isolate benchmark cores MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Entrypoint dynamically splits the container cpuset at boot: - First core → housekeeping (entrypoint, sshd, python, uv) - Remaining cores → benchmarks only (exported as RT_BENCHMARK_CORES) All service processes inherit the housekeeping affinity via taskset. detect_cpus() reads RT_BENCHMARK_CORES first, so caterpillar/cyclictest only receive the clean cores. No hardcoded core numbers — fully dynamic from cgroup cpuset at runtime. 
Signed-off-by: Mikhail Malyshev --- Dockerfile.base | 63 +++++++++++++++++++++++++++++++++++++++++++--- src/detect_cpus.py | 13 ++++++++-- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/Dockerfile.base b/Dockerfile.base index 0a1fd23..4b938f5 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -123,9 +123,61 @@ EXPOSE 22 8888 # Entrypoint script: start sshd, run CMD, keep container alive RUN cat > /entrypoint.sh <<'ENTRY' #!/bin/bash + +# --- Detect cpuset and split into housekeeping vs benchmark cores --- +CPUSET="" +for f in /sys/fs/cgroup/cpuset.cpus.effective \ + /sys/fs/cgroup/cpuset/cpuset.cpus \ + /sys/fs/cgroup/cpuset/cpuset.effective_cpus; do + if [ -f "$f" ]; then + CPUSET=$(cat "$f") + break + fi +done + +# Expand cpuset string (e.g. "2,4-6") into a sorted list of cores +expand_cpuset() { + local result="" + IFS=',' read -ra parts <<< "$1" + for part in "${parts[@]}"; do + if [[ "$part" == *-* ]]; then + IFS='-' read -r lo hi <<< "$part" + for ((i=lo; i<=hi; i++)); do + result="$result $i" + done + else + result="$result $part" + fi + done + echo "$result" | tr ' ' '\n' | sort -n | tr '\n' ' ' +} + +ALL_CORES=( $(expand_cpuset "$CPUSET") ) +NUM_CORES=${#ALL_CORES[@]} + +if [ "$NUM_CORES" -ge 2 ]; then + # First core is housekeeping, rest are for benchmarks + HOUSEKEEPING_CORE=${ALL_CORES[0]} + BENCHMARK_CORES=$(IFS=,; echo "${ALL_CORES[*]:1}") + echo "CPU layout: housekeeping=${HOUSEKEEPING_CORE} benchmark=${BENCHMARK_CORES} (from cpuset: ${CPUSET})" +else + # Only one core, everything shares it + HOUSEKEEPING_CORE=${ALL_CORES[0]:-0} + BENCHMARK_CORES=${ALL_CORES[0]:-0} + echo "WARNING: Only ${NUM_CORES} core(s) available (${CPUSET}), no isolation possible" +fi + +# Export for child processes (main.py can read these) +export RT_HOUSEKEEPING_CORE="$HOUSEKEEPING_CORE" +export RT_BENCHMARK_CORES="$BENCHMARK_CORES" + +# Pin this shell (and all children: sshd, python, uv) to housekeeping core +taskset -pc "$HOUSEKEEPING_CORE" $$ 2>/dev/null 
|| true + +# --- Start services pinned to housekeeping core --- mkdir -p /run/sshd -echo "Starting SSH server..." -/usr/sbin/sshd -e || echo "WARNING: sshd failed to start (exit code $?)" +echo "Starting SSH server on core ${HOUSEKEEPING_CORE}..." +taskset -c "$HOUSEKEEPING_CORE" /usr/sbin/sshd -e || echo "WARNING: sshd failed to start (exit code $?)" IP=$(hostname -I 2>/dev/null | awk '{print $1}') IP=${IP:-} @@ -135,6 +187,9 @@ echo "============================================================" echo " RT Benchmarking Container" echo "============================================================" echo "" +echo " Housekeeping core: ${HOUSEKEEPING_CORE}" +echo " Benchmark cores: ${BENCHMARK_CORES}" +echo "" echo " SSH: ssh -i root@${IP}" echo " Jupyter: ssh in, then run: jupyter-start" echo " or from this console:" @@ -164,8 +219,8 @@ if [ $# -gt 0 ]; then echo "" fi -echo "Container staying alive (sshd running). SSH in or ctrl-c to stop." -exec /usr/sbin/sshd -D +echo "Container staying alive (sshd running on core ${HOUSEKEEPING_CORE}). SSH in or ctrl-c to stop." +exec taskset -c "$HOUSEKEEPING_CORE" /usr/sbin/sshd -D ENTRY RUN chmod +x /entrypoint.sh diff --git a/src/detect_cpus.py b/src/detect_cpus.py index cb4a2d9..5229361 100644 --- a/src/detect_cpus.py +++ b/src/detect_cpus.py @@ -15,9 +15,18 @@ def detect_cpus() -> str: """Detect which CPUs this process/container is allowed to run on. - Returns the cpuset string (e.g. "9,11" or "2-5") as seen by cgroup, - falling back to /proc/stat or sysconf if cgroup is unavailable. + Returns the cpuset string (e.g. "9,11" or "2-5"). + + Prefers RT_BENCHMARK_CORES env var (set by entrypoint.sh) which + excludes the housekeeping core. Falls back to cgroup, /proc/stat, + or sysconf. 
""" + # Entrypoint sets this to the clean cores (excluding housekeeping) + env_cores = os.environ.get("RT_BENCHMARK_CORES", "").strip() + if env_cores: + log.info("Detected CPUs from RT_BENCHMARK_CORES env: %s", env_cores) + return env_cores + for source, result in ( ("cgroup-v2", _from_cgroup_v2), ("cgroup-v1", _from_cgroup_v1), From f4e7049ff4f078d6a707645a75ab5651719af967 Mon Sep 17 00:00:00 2001 From: Mikhail Malyshev Date: Tue, 17 Feb 2026 13:18:21 +0100 Subject: [PATCH 12/15] fix: keep container alive after benchmark, record effective cores in sysinfo - Replace exec sshd -D with sleep infinity to prevent container exit (sshd already running in background, second instance failed on port conflict) - Move detect_cpus() before sysinfo collection so effective cores are captured in sysinfo.json under new "runtime" section (effective_cores, housekeeping_core, source, config_cores) Signed-off-by: Mikhail Malyshev --- Dockerfile.base | 2 +- main.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Dockerfile.base b/Dockerfile.base index 4b938f5..97bee40 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -220,7 +220,7 @@ if [ $# -gt 0 ]; then fi echo "Container staying alive (sshd running on core ${HOUSEKEEPING_CORE}). SSH in or ctrl-c to stop." 
-exec taskset -c "$HOUSEKEEPING_CORE" /usr/sbin/sshd -D +sleep infinity ENTRY RUN chmod +x /entrypoint.sh diff --git a/main.py b/main.py index ac850e1..914764c 100644 --- a/main.py +++ b/main.py @@ -95,8 +95,24 @@ def run_test(cfg: DictConfig): # Validate RT environment before doing anything else run_preflight(strict=not cfg.run.docker) + # Detect effective cores early so sysinfo captures the real values + cores = detect_cpus() + if cores == "": + cores = cfg.run.t_core + + housekeeping_core = os.environ.get("RT_HOUSEKEEPING_CORE", "") + collector = SystemInfoCollector() collector.gather_all(cfg) + collector.info["runtime"] = { + "effective_cores": cores, + "config_cores": cfg.run.t_core, + "housekeeping_core": housekeeping_core or "N/A", + "source": "RT_BENCHMARK_CORES" + if os.environ.get("RT_BENCHMARK_CORES") + else "cgroup/fallback", + "command": cfg.run.command, + } collector.dump_to_file(cfg.sysinfo_collector_file) # Collect BIOS settings via redfish @@ -140,10 +156,6 @@ def run_test(cfg: DictConfig): if cfg.irq_affinity.enabled: set_irq_affinity(cfg.irq_affinity.housekeeping_cores) - cores = detect_cpus() - if cores == "": - cores = cfg.run.t_core - return runner.run_test(cfg.run.command, cores, cfg.run.stressor) From 059915050a7cf69d2a3c11aacf5371852d5c5c26 Mon Sep 17 00:00:00 2001 From: Mikhail Malyshev Date: Sun, 1 Mar 2026 23:19:41 +0100 Subject: [PATCH 13/15] fix: add demo.demo_mode=false to container ENTRYPOINTs After rebase onto main, the demo_mode branch (from hde2e PR) is now in the code path. Containers must explicitly opt out to avoid hitting the DockerHDE2E path instead of DockerTestRunner. Config default remains demo_mode=true for bare-metal HDE2E workflow. 
--- caterpillar/Dockerfile | 2 +- cyclictest/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/caterpillar/Dockerfile b/caterpillar/Dockerfile index 67abd0f..3d1eaff 100644 --- a/caterpillar/Dockerfile +++ b/caterpillar/Dockerfile @@ -43,4 +43,4 @@ RUN uv python install 3.12 && uv sync # Output directory for results RUN mkdir -p /tmp/output -ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "run.command=caterpillar"] +ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "demo.demo_mode=false", "run.command=caterpillar"] diff --git a/cyclictest/Dockerfile b/cyclictest/Dockerfile index fdd74c3..afc5b49 100644 --- a/cyclictest/Dockerfile +++ b/cyclictest/Dockerfile @@ -42,4 +42,4 @@ RUN uv python install 3.12 && uv sync # Output directory for results RUN mkdir -p /tmp/output -ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "run.command=cyclictest"] +ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "demo.demo_mode=false", "run.command=cyclictest"] From 8fa54e9ea84867411ca0d6d25110efbe10efee78 Mon Sep 17 00:00:00 2001 From: Mikhail Malyshev Date: Sun, 1 Mar 2026 23:35:02 +0100 Subject: [PATCH 14/15] fix: improve isolcpus preflight check, remove cstate typo detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove the cmdline typo detector for 'rocessor.max_cstate' — it was fragile and produced false positives. - Parse isolcpus flags (managed_irq, domain, io_queue) properly instead of feeding them to _parse_cpulist which would crash on non-numeric tokens. 
- Warn when managed_irq or domain flags are missing — without them the kernel still schedules IRQs and tasks onto isolated cores. Note that when any flag is specified, 'domain' is no longer implied and must be listed explicitly. - Warn when io_queue flag is missing on kernel 6.17+ — this flag prevents block-layer IO completion queues from landing on isolated cores. - Support open-ended 1-N notation in CPU lists (meaning 'through the last available CPU'), as accepted by the kernel. - Make _parse_cpulist resilient to non-numeric tokens (skip instead of crashing with ValueError). --- src/rt_preflight.py | 166 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 140 insertions(+), 26 deletions(-) diff --git a/src/rt_preflight.py b/src/rt_preflight.py index ef8c074..ed8a082 100644 --- a/src/rt_preflight.py +++ b/src/rt_preflight.py @@ -113,21 +113,94 @@ def _cmdline_has(name: str) -> bool: return bool(re.search(rf"(?:^|\s){re.escape(name)}(?:=|\s|$)", _cmdline())) +def _max_cpu_id() -> int: + """Return the highest possible CPU id from the kernel. + + Reads ``/sys/devices/system/cpu/possible`` (e.g. ``0-127``). + Falls back to ``os.cpu_count() - 1``. + """ + possible = _read("/sys/devices/system/cpu/possible") + if possible: + # e.g. "0-127" — take the last number + m = re.search(r"(\d+)\s*$", possible) + if m: + return int(m.group(1)) + n = os.cpu_count() + return (n - 1) if n and n > 0 else 0 + + def _parse_cpulist(s: str) -> set[int]: - """Parse '1-3,5,7-9' into {1,2,3,5,7,8,9}.""" + """Parse '1-3,5,7-9' into {1,2,3,5,7,8,9}. + + Supports open-ended ranges: ``1-N`` means CPU 1 through the last + available CPU (read from ``/sys/devices/system/cpu/possible``). + + Non-numeric tokens (e.g. 'managed_irq', 'domain') are silently + skipped so that ``isolcpus=managed_irq,domain,2-5`` works. 
+ """ cpus: set[int] = set() for part in s.split(","): part = part.strip() if not part: continue if "-" in part: - lo, hi = part.split("-", 1) - cpus.update(range(int(lo), int(hi) + 1)) + lo_s, hi_s = part.split("-", 1) + try: + lo = int(lo_s) + except ValueError: + continue + hi_s = hi_s.strip() + if hi_s.upper() == "N" or hi_s == "": + hi = _max_cpu_id() + else: + try: + hi = int(hi_s) + except ValueError: + continue + cpus.update(range(lo, hi + 1)) else: - cpus.add(int(part)) + try: + cpus.add(int(part)) + except ValueError: + continue return cpus +def _parse_isolcpus_flags(param: str) -> tuple[set[str], str]: + """Parse isolcpus value into (flags, cpu_list_str). + + isolcpus can be: + - ``isolcpus=2-5`` → flags={}, cpulist="2-5" + - ``isolcpus=managed_irq,2-5`` → flags={"managed_irq"}, cpulist="2-5" + - ``isolcpus=managed_irq,domain,io_queue,2-5`` + → flags={"managed_irq","domain","io_queue"}, cpulist="2-5" + """ + known_flags = {"managed_irq", "domain", "io_queue"} + flags: set[str] = set() + cpu_parts: list[str] = [] + + for token in param.split(","): + token = token.strip() + if token in known_flags: + flags.add(token) + elif token: + cpu_parts.append(token) + + return flags, ",".join(cpu_parts) + + +def _kernel_version() -> tuple[int, int]: + """Return (major, minor) kernel version from /proc/version. + + Falls back to (0, 0) if parsing fails. 
+ """ + version_str = _read("/proc/version") or "" + m = re.search(r"Linux version (\d+)\.(\d+)", version_str) + if m: + return int(m.group(1)), int(m.group(2)) + return 0, 0 + + def _get_container_cpus() -> Optional[str]: """Read the cpuset that this container is allowed to use.""" for p in ( @@ -182,7 +255,7 @@ def check_preempt_rt() -> CheckResult: def check_isolcpus() -> CheckResult: - """Check that isolcpus is set and our container cpus are in the isolated set.""" + """Check that isolcpus is set with recommended flags and covers container CPUs.""" name = "CPU isolation (isolcpus)" isolated_param = _cmdline_param("isolcpus") @@ -196,33 +269,91 @@ def check_isolcpus() -> CheckResult: name, Status.WARN, "isolcpus not found in cmdline or sysfs" ) - isolated_set = _parse_cpulist(isolated_param) + # Parse flags and CPU list from isolcpus value + flags, cpulist_str = _parse_isolcpus_flags(isolated_param) + # If the value came from sysfs it's just a cpu list, no flags + isolated_set = ( + _parse_cpulist(cpulist_str) if cpulist_str else _parse_cpulist(isolated_param) + ) + warnings: list[str] = [] + + # Check for recommended flags. + # When any flags are present, 'domain' is NOT implied — it must be + # specified explicitly. Without flags isolcpus defaults to + # domain isolation, but adding e.g. managed_irq alone disables + # that default. + missing_flags: list[str] = [] + if "managed_irq" not in flags: + missing_flags.append("managed_irq") + if "domain" not in flags: + missing_flags.append("domain") + + if missing_flags: + warnings.append( + f"{', '.join(missing_flags)} flag(s) missing from isolcpus. " + "Note: when any flag is specified, 'domain' is no longer " + "implied and must be listed explicitly. 
Recommended: " + "isolcpus=managed_irq,domain," + ) + + kmaj, kmin = _kernel_version() + if (kmaj, kmin) >= (6, 17) and "io_queue" not in flags: + warnings.append( + f"io_queue flag missing (kernel {kmaj}.{kmin} supports it) — " + "block-layer IO completion queues may still land on isolated " + "cores. Recommended: isolcpus=managed_irq,domain,io_queue," + ) + + # Verify container CPUs are in the isolated set container_cpus_str = _get_container_cpus() if not container_cpus_str: + msg = f"isolcpus={isolated_param} (could not read container cpuset)" + if warnings: + return CheckResult( + name, + Status.WARN, + msg, + detail="\n".join(warnings), + ) return CheckResult( name, Status.PASS, - f"isolcpus={isolated_param} (could not read container cpuset)", + msg, detail="Cannot verify overlap — cgroup cpuset not readable", ) container_set = _parse_cpulist(container_cpus_str) not_isolated = container_set - isolated_set if not_isolated: + warnings.insert( + 0, + f"Container CPUs {sorted(not_isolated)} are NOT in " + f"isolcpus={isolated_param}", + ) return CheckResult( name, Status.WARN, f"Container CPUs {sorted(not_isolated)} are NOT in isolcpus={isolated_param}", detail=f"Container cpuset: {container_cpus_str}\n" - f"Isolated: {isolated_param}", + f"Isolated: {isolated_param}\n" + "\n".join(warnings), + ) + + detail = f"isolcpus={isolated_param}" + if warnings: + detail += "\n" + "\n".join(warnings) + return CheckResult( + name, + Status.WARN, + f"All container CPUs isolated but missing recommended flags", + detail=detail, ) return CheckResult( name, Status.PASS, f"All container CPUs ({container_cpus_str}) are isolated", - detail=f"isolcpus={isolated_param}", + detail=detail, ) @@ -360,24 +491,7 @@ def check_cstates() -> CheckResult: detail="\n".join(f" {k}={v}" for k, v in found.items()), ) - # Also check for the known typo - typo_params = ["rocessor.max_cstate", "rocessor_idle.max_cstate"] - typos_found = [] - for tp in typo_params: - if tp in cmdline: - 
typos_found.append(tp) - detail = "\n".join(f" {k}={v}" for k, v in found.items()) - if typos_found: - detail += f"\n WARNING: Possible typo in cmdline: {', '.join(typos_found)}" - detail += "\n (missing leading 'p' — parameter is being ignored by kernel)" - return CheckResult( - name, - Status.WARN, - "C-states look disabled but found typo in cmdline", - detail=detail, - ) - return CheckResult( name, Status.PASS, "All C-states disabled via kernel cmdline", detail=detail ) From 59d5a4123a354e115ca7e55cd1e33d57ac7d6fa2 Mon Sep 17 00:00:00 2001 From: Mikhail Malyshev Date: Wed, 4 Mar 2026 17:21:16 +0100 Subject: [PATCH 15/15] refactor: split Dockerfiles into regular (docker host) and .eve (EVE OS) variants Restore caterpillar/Dockerfile and cyclictest/Dockerfile to their original lightweight docker-host versions. The EVE OS standalone containers (with SSH, Jupyter, cpuset pinning, entrypoint) are now in Dockerfile.eve, Dockerfile.base.eve, caterpillar/Dockerfile.eve, and cyclictest/Dockerfile.eve. Rename build-all.sh to build-all-eve.sh and update it to reference the .eve Dockerfiles. 
Signed-off-by: Mikhail Malyshev --- Dockerfile.base | 238 +----------------------------- Dockerfile.base.eve | 242 +++++++++++++++++++++++++++++++ Dockerfile => Dockerfile.eve | 0 build-all.sh => build-all-eve.sh | 6 +- caterpillar/Dockerfile | 29 +--- caterpillar/Dockerfile.eve | 46 ++++++ cyclictest/Dockerfile | 31 +--- cyclictest/Dockerfile.eve | 45 ++++++ 8 files changed, 355 insertions(+), 282 deletions(-) create mode 100644 Dockerfile.base.eve rename Dockerfile => Dockerfile.eve (100%) rename build-all.sh => build-all-eve.sh (95%) create mode 100644 caterpillar/Dockerfile.eve create mode 100644 cyclictest/Dockerfile.eve diff --git a/Dockerfile.base b/Dockerfile.base index 97bee40..8cb97e6 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -1,242 +1,20 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ - bash \ - bash-completion \ ca-certificates \ curl \ - git \ gnupg \ - less \ lsb-release \ - ncurses-base \ - ncurses-term \ - openssh-server \ - procps \ - wget \ - && rm -rf /var/lib/apt/lists/* - -# Shell environment -ENV SHELL=/bin/bash -ENV TERM=xterm-256color -ENV LANG=C.UTF-8 -ENV LC_ALL=C.UTF-8 - -# Login banner (motd) -RUN cat > /etc/motd <<'MOTD' - - ╔═════════════════════════════════════════════════════════════╗ - ║ RT Benchmarking Container ║ - ╠═════════════════════════════════════════════════════════════╣ - ║ ║ - ║ BENCHMARKS ║ - ║ Run benchmark: cd /app && uv run python main.py \ ║ - ║ run.docker=false pqos.enable=false ║ - ║ Pre-flight: cd /app && uv run python -m \ ║ - ║ src.rt_preflight ║ - ║ ║ - ║ JUPYTER NOTEBOOK ║ - ║ Start: jupyter-start ║ - ║ Then open: http://:8888 ║ - ║ ║ - ║ RESULTS ║ - ║ Output dir: /app/outputs/ ║ - ║ Notebooks: /app/notebook/ ║ - ║ SCP results: scp -i root@:/app/outputs . 
║ - ║ ║ - ║ USEFUL COMMANDS ║ - ║ rt-preflight Run RT environment checks ║ - ║ rt-info Show detected CPUs and kernel info ║ - ║ ll List files (long format) ║ - ║ ║ - ╚═════════════════════════════════════════════════════════════╝ - -MOTD - -# Minimal but usable bashrc -RUN cat > /root/.bashrc <<'EOF' -# prompt -PS1='\[\e[1;32m\]\u@\h\[\e[0m\]:\[\e[1;34m\]\w\[\e[0m\]\$ ' - -# history -HISTSIZE=1000 -HISTFILESIZE=2000 -HISTCONTROL=ignoreboth -shopt -s histappend - -# usability -shopt -s checkwinsize - -# color ls -alias ls='ls --color=auto' -alias ll='ls -alF' -alias la='ls -A' -alias l='ls -CF' - -# RT tools aliases -jupyter-start() { - local ip - ip=$(hostname -I 2>/dev/null | awk '{print $1}') - ip=${ip:-0.0.0.0} - echo "" - echo " Starting Jupyter Notebook..." - echo " Connect at: http://${ip}:8888" - echo "" - cd /app && uv run jupyter notebook \ - --ip=0.0.0.0 \ - --port=8888 \ - --no-browser \ - --allow-root \ - --notebook-dir=/app/notebook \ - --ServerApp.custom_display_url="http://${ip}:8888" -} -alias rt-preflight='cd /app && uv run python -m src.rt_preflight' -alias rt-info='echo "=== Kernel ===" && uname -a && echo && echo "=== CPUs (cgroup) ===" && cat /sys/fs/cgroup/cpuset.cpus.effective 2>/dev/null || cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null || echo "N/A" && echo && echo "=== RT cmdline params ===" && cat /proc/cmdline | tr " " "\n" | grep -E "isolcpus|nohz|rcu_nocbs|irqaffinity|cstate|pstate|hugepages"' - -# bash completion -if [ -f /usr/share/bash-completion/bash_completion ]; then - . 
/usr/share/bash-completion/bash_completion -fi - -# show motd on interactive login -if [ -f /etc/motd ] && [ -t 0 ]; then - cat /etc/motd -fi -EOF - -RUN cp /root/.bashrc /etc/skel/.bashrc - -# SSH server configuration -ARG SSH_KEY -RUN mkdir -p /run/sshd /root/.ssh && \ - chmod 700 /root/.ssh && \ - ssh-keygen -A && \ - sed -i 's/^#\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config && \ - sed -i 's/^#\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config && \ - sed -i 's/^#\?PubkeyAuthentication.*/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \ - sed -i 's/^#\?UsePAM.*/UsePAM no/' /etc/ssh/sshd_config && \ - echo "${SSH_KEY}" > /root/.ssh/authorized_keys && \ - chmod 600 /root/.ssh/authorized_keys - -EXPOSE 22 8888 - -# Entrypoint script: start sshd, run CMD, keep container alive -RUN cat > /entrypoint.sh <<'ENTRY' -#!/bin/bash - -# --- Detect cpuset and split into housekeeping vs benchmark cores --- -CPUSET="" -for f in /sys/fs/cgroup/cpuset.cpus.effective \ - /sys/fs/cgroup/cpuset/cpuset.cpus \ - /sys/fs/cgroup/cpuset/cpuset.effective_cpus; do - if [ -f "$f" ]; then - CPUSET=$(cat "$f") - break - fi -done - -# Expand cpuset string (e.g. 
"2,4-6") into a sorted list of cores -expand_cpuset() { - local result="" - IFS=',' read -ra parts <<< "$1" - for part in "${parts[@]}"; do - if [[ "$part" == *-* ]]; then - IFS='-' read -r lo hi <<< "$part" - for ((i=lo; i<=hi; i++)); do - result="$result $i" - done - else - result="$result $part" - fi - done - echo "$result" | tr ' ' '\n' | sort -n | tr '\n' ' ' -} - -ALL_CORES=( $(expand_cpuset "$CPUSET") ) -NUM_CORES=${#ALL_CORES[@]} - -if [ "$NUM_CORES" -ge 2 ]; then - # First core is housekeeping, rest are for benchmarks - HOUSEKEEPING_CORE=${ALL_CORES[0]} - BENCHMARK_CORES=$(IFS=,; echo "${ALL_CORES[*]:1}") - echo "CPU layout: housekeeping=${HOUSEKEEPING_CORE} benchmark=${BENCHMARK_CORES} (from cpuset: ${CPUSET})" -else - # Only one core, everything shares it - HOUSEKEEPING_CORE=${ALL_CORES[0]:-0} - BENCHMARK_CORES=${ALL_CORES[0]:-0} - echo "WARNING: Only ${NUM_CORES} core(s) available (${CPUSET}), no isolation possible" -fi - -# Export for child processes (main.py can read these) -export RT_HOUSEKEEPING_CORE="$HOUSEKEEPING_CORE" -export RT_BENCHMARK_CORES="$BENCHMARK_CORES" - -# Pin this shell (and all children: sshd, python, uv) to housekeeping core -taskset -pc "$HOUSEKEEPING_CORE" $$ 2>/dev/null || true - -# --- Start services pinned to housekeeping core --- -mkdir -p /run/sshd -echo "Starting SSH server on core ${HOUSEKEEPING_CORE}..." 
-taskset -c "$HOUSEKEEPING_CORE" /usr/sbin/sshd -e || echo "WARNING: sshd failed to start (exit code $?)" - -IP=$(hostname -I 2>/dev/null | awk '{print $1}') -IP=${IP:-} - -echo "" -echo "============================================================" -echo " RT Benchmarking Container" -echo "============================================================" -echo "" -echo " Housekeeping core: ${HOUSEKEEPING_CORE}" -echo " Benchmark cores: ${BENCHMARK_CORES}" -echo "" -echo " SSH: ssh -i root@${IP}" -echo " Jupyter: ssh in, then run: jupyter-start" -echo " or from this console:" -echo " cd /app && uv run jupyter notebook --ip=0.0.0.0 \\" -echo " --port=8888 --no-browser --allow-root \\" -echo " --notebook-dir=/app/notebook" -echo " then open: http://${IP}:8888" -echo "" -echo " Results: /app/outputs/" -echo " Notebooks: /app/notebook/" -echo "============================================================" -echo "" - -if [ $# -gt 0 ]; then - echo "Running: $@" - "$@" - EXIT_CODE=$? - echo "" - echo "============================================================" - echo " Command finished with exit code ${EXIT_CODE}" - echo "" - echo " Jupyter: http://${IP}:8888" - echo " To start: ssh root@${IP} then run: jupyter-start" - echo " SSH: ssh -i root@${IP}" - echo " Results: /app/outputs/" - echo "============================================================" - echo "" -fi - -echo "Container staying alive (sshd running on core ${HOUSEKEEPING_CORE}). SSH in or ctrl-c to stop." 
-sleep infinity -ENTRY -RUN chmod +x /entrypoint.sh - -# Intel and ECI repository keys and sources + wget RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null RUN wget -O- https://eci.intel.com/repos/gpg-keys/GPG-PUB-KEY-INTEL-ECI.gpg | tee /usr/share/keyrings/eci-archive-keyring.gpg > /dev/null RUN wget -O- https://raw.githubusercontent.com/ros/rosdistro/master/ros.key | tee /usr/share/keyrings/ros-archive-keyring.gpg > /dev/null RUN . /etc/os-release \ - && echo $VERSION_CODENAME && \ - bash -c 'echo "deb [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee /etc/apt/sources.list.d/eci.list' && \ - bash -c 'echo "deb-src [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee -a /etc/apt/sources.list.d/eci.list' && \ - bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list' && \ - bash -c 'echo -e "Package: intel-oneapi-runtime-*\nPin: version 2024.1.*\nPin-Priority: 1001" > /etc/apt/preferences.d/oneapi' && \ - bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/openvino/2024 ubuntu24 main" > /etc/apt/sources.list.d/intel-openvino-2024.list' && \ - bash -c 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] http://packages.ros.org/ros2/ubuntu $(source /etc/os-release && echo $VERSION_CODENAME) main" | tee /etc/apt/sources.list.d/ros2.list' +&& echo $VERSION_CODENAME && \ +bash -c 'echo "deb [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee 
/etc/apt/sources.list.d/eci.list' && \ +bash -c 'echo "deb-src [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee -a /etc/apt/sources.list.d/eci.list' && \ +bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list' && \ +bash -c 'echo -e "Package: intel-oneapi-runtime-*\nPin: version 2024.1.*\nPin-Priority: 1001" > /etc/apt/preferences.d/oneapi' && \ +bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/openvino/2024 ubuntu24 main" > /etc/apt/sources.list.d/intel-openvino-2024.list' && \ +bash -c 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] http://packages.ros.org/ros2/ubuntu $(source /etc/os-release && echo $VERSION_CODENAME) main" | tee /etc/apt/sources.list.d/ros2.list' RUN apt-get update && apt-get install -y intel-cmt-cat - -ENTRYPOINT ["/entrypoint.sh"] -CMD ["/bin/bash", "-l"] diff --git a/Dockerfile.base.eve b/Dockerfile.base.eve new file mode 100644 index 0000000..97bee40 --- /dev/null +++ b/Dockerfile.base.eve @@ -0,0 +1,242 @@ +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + bash \ + bash-completion \ + ca-certificates \ + curl \ + git \ + gnupg \ + less \ + lsb-release \ + ncurses-base \ + ncurses-term \ + openssh-server \ + procps \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# Shell environment +ENV SHELL=/bin/bash +ENV TERM=xterm-256color +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# Login banner (motd) +RUN cat > /etc/motd <<'MOTD' + + ╔═════════════════════════════════════════════════════════════╗ + ║ RT Benchmarking Container ║ + ╠═════════════════════════════════════════════════════════════╣ + ║ ║ + ║ BENCHMARKS ║ + ║ Run benchmark: cd /app && uv run python main.py \ ║ + ║ 
run.docker=false pqos.enable=false ║ + ║ Pre-flight: cd /app && uv run python -m \ ║ + ║ src.rt_preflight ║ + ║ ║ + ║ JUPYTER NOTEBOOK ║ + ║ Start: jupyter-start ║ + ║ Then open: http://:8888 ║ + ║ ║ + ║ RESULTS ║ + ║ Output dir: /app/outputs/ ║ + ║ Notebooks: /app/notebook/ ║ + ║ SCP results: scp -i root@:/app/outputs . ║ + ║ ║ + ║ USEFUL COMMANDS ║ + ║ rt-preflight Run RT environment checks ║ + ║ rt-info Show detected CPUs and kernel info ║ + ║ ll List files (long format) ║ + ║ ║ + ╚═════════════════════════════════════════════════════════════╝ + +MOTD + +# Minimal but usable bashrc +RUN cat > /root/.bashrc <<'EOF' +# prompt +PS1='\[\e[1;32m\]\u@\h\[\e[0m\]:\[\e[1;34m\]\w\[\e[0m\]\$ ' + +# history +HISTSIZE=1000 +HISTFILESIZE=2000 +HISTCONTROL=ignoreboth +shopt -s histappend + +# usability +shopt -s checkwinsize + +# color ls +alias ls='ls --color=auto' +alias ll='ls -alF' +alias la='ls -A' +alias l='ls -CF' + +# RT tools aliases +jupyter-start() { + local ip + ip=$(hostname -I 2>/dev/null | awk '{print $1}') + ip=${ip:-0.0.0.0} + echo "" + echo " Starting Jupyter Notebook..." + echo " Connect at: http://${ip}:8888" + echo "" + cd /app && uv run jupyter notebook \ + --ip=0.0.0.0 \ + --port=8888 \ + --no-browser \ + --allow-root \ + --notebook-dir=/app/notebook \ + --ServerApp.custom_display_url="http://${ip}:8888" +} +alias rt-preflight='cd /app && uv run python -m src.rt_preflight' +alias rt-info='echo "=== Kernel ===" && uname -a && echo && echo "=== CPUs (cgroup) ===" && cat /sys/fs/cgroup/cpuset.cpus.effective 2>/dev/null || cat /sys/fs/cgroup/cpuset/cpuset.cpus 2>/dev/null || echo "N/A" && echo && echo "=== RT cmdline params ===" && cat /proc/cmdline | tr " " "\n" | grep -E "isolcpus|nohz|rcu_nocbs|irqaffinity|cstate|pstate|hugepages"' + +# bash completion +if [ -f /usr/share/bash-completion/bash_completion ]; then + . 
/usr/share/bash-completion/bash_completion +fi + +# show motd on interactive login +if [ -f /etc/motd ] && [ -t 0 ]; then + cat /etc/motd +fi +EOF + +RUN cp /root/.bashrc /etc/skel/.bashrc + +# SSH server configuration +ARG SSH_KEY +RUN mkdir -p /run/sshd /root/.ssh && \ + chmod 700 /root/.ssh && \ + ssh-keygen -A && \ + sed -i 's/^#\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config && \ + sed -i 's/^#\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config && \ + sed -i 's/^#\?PubkeyAuthentication.*/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \ + sed -i 's/^#\?UsePAM.*/UsePAM no/' /etc/ssh/sshd_config && \ + echo "${SSH_KEY}" > /root/.ssh/authorized_keys && \ + chmod 600 /root/.ssh/authorized_keys + +EXPOSE 22 8888 + +# Entrypoint script: start sshd, run CMD, keep container alive +RUN cat > /entrypoint.sh <<'ENTRY' +#!/bin/bash + +# --- Detect cpuset and split into housekeeping vs benchmark cores --- +CPUSET="" +for f in /sys/fs/cgroup/cpuset.cpus.effective \ + /sys/fs/cgroup/cpuset/cpuset.cpus \ + /sys/fs/cgroup/cpuset/cpuset.effective_cpus; do + if [ -f "$f" ]; then + CPUSET=$(cat "$f") + break + fi +done + +# Expand cpuset string (e.g. 
"2,4-6") into a sorted list of cores +expand_cpuset() { + local result="" + IFS=',' read -ra parts <<< "$1" + for part in "${parts[@]}"; do + if [[ "$part" == *-* ]]; then + IFS='-' read -r lo hi <<< "$part" + for ((i=lo; i<=hi; i++)); do + result="$result $i" + done + else + result="$result $part" + fi + done + echo "$result" | tr ' ' '\n' | sort -n | tr '\n' ' ' +} + +ALL_CORES=( $(expand_cpuset "$CPUSET") ) +NUM_CORES=${#ALL_CORES[@]} + +if [ "$NUM_CORES" -ge 2 ]; then + # First core is housekeeping, rest are for benchmarks + HOUSEKEEPING_CORE=${ALL_CORES[0]} + BENCHMARK_CORES=$(IFS=,; echo "${ALL_CORES[*]:1}") + echo "CPU layout: housekeeping=${HOUSEKEEPING_CORE} benchmark=${BENCHMARK_CORES} (from cpuset: ${CPUSET})" +else + # Only one core, everything shares it + HOUSEKEEPING_CORE=${ALL_CORES[0]:-0} + BENCHMARK_CORES=${ALL_CORES[0]:-0} + echo "WARNING: Only ${NUM_CORES} core(s) available (${CPUSET}), no isolation possible" +fi + +# Export for child processes (main.py can read these) +export RT_HOUSEKEEPING_CORE="$HOUSEKEEPING_CORE" +export RT_BENCHMARK_CORES="$BENCHMARK_CORES" + +# Pin this shell (and all children: sshd, python, uv) to housekeeping core +taskset -pc "$HOUSEKEEPING_CORE" $$ 2>/dev/null || true + +# --- Start services pinned to housekeeping core --- +mkdir -p /run/sshd +echo "Starting SSH server on core ${HOUSEKEEPING_CORE}..." 
+taskset -c "$HOUSEKEEPING_CORE" /usr/sbin/sshd -e || echo "WARNING: sshd failed to start (exit code $?)" + +IP=$(hostname -I 2>/dev/null | awk '{print $1}') +IP=${IP:-} + +echo "" +echo "============================================================" +echo " RT Benchmarking Container" +echo "============================================================" +echo "" +echo " Housekeeping core: ${HOUSEKEEPING_CORE}" +echo " Benchmark cores: ${BENCHMARK_CORES}" +echo "" +echo " SSH: ssh -i root@${IP}" +echo " Jupyter: ssh in, then run: jupyter-start" +echo " or from this console:" +echo " cd /app && uv run jupyter notebook --ip=0.0.0.0 \\" +echo " --port=8888 --no-browser --allow-root \\" +echo " --notebook-dir=/app/notebook" +echo " then open: http://${IP}:8888" +echo "" +echo " Results: /app/outputs/" +echo " Notebooks: /app/notebook/" +echo "============================================================" +echo "" + +if [ $# -gt 0 ]; then + echo "Running: $@" + "$@" + EXIT_CODE=$? + echo "" + echo "============================================================" + echo " Command finished with exit code ${EXIT_CODE}" + echo "" + echo " Jupyter: http://${IP}:8888" + echo " To start: ssh root@${IP} then run: jupyter-start" + echo " SSH: ssh -i root@${IP}" + echo " Results: /app/outputs/" + echo "============================================================" + echo "" +fi + +echo "Container staying alive (sshd running on core ${HOUSEKEEPING_CORE}). SSH in or ctrl-c to stop." 
+sleep infinity +ENTRY +RUN chmod +x /entrypoint.sh + +# Intel and ECI repository keys and sources +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null +RUN wget -O- https://eci.intel.com/repos/gpg-keys/GPG-PUB-KEY-INTEL-ECI.gpg | tee /usr/share/keyrings/eci-archive-keyring.gpg > /dev/null +RUN wget -O- https://raw.githubusercontent.com/ros/rosdistro/master/ros.key | tee /usr/share/keyrings/ros-archive-keyring.gpg > /dev/null +RUN . /etc/os-release \ + && echo $VERSION_CODENAME && \ + bash -c 'echo "deb [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee /etc/apt/sources.list.d/eci.list' && \ + bash -c 'echo "deb-src [signed-by=/usr/share/keyrings/eci-archive-keyring.gpg] https://eci.intel.com/repos/$(source /etc/os-release && echo $VERSION_CODENAME) isar main" | tee -a /etc/apt/sources.list.d/eci.list' && \ + bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list' && \ + bash -c 'echo -e "Package: intel-oneapi-runtime-*\nPin: version 2024.1.*\nPin-Priority: 1001" > /etc/apt/preferences.d/oneapi' && \ + bash -c 'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/openvino/2024 ubuntu24 main" > /etc/apt/sources.list.d/intel-openvino-2024.list' && \ + bash -c 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/ros-archive-keyring.gpg] http://packages.ros.org/ros2/ubuntu $(source /etc/os-release && echo $VERSION_CODENAME) main" | tee /etc/apt/sources.list.d/ros2.list' +RUN apt-get update && apt-get install -y intel-cmt-cat + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["/bin/bash", "-l"] diff --git a/Dockerfile b/Dockerfile.eve similarity index 100% rename from Dockerfile rename to 
Dockerfile.eve diff --git a/build-all.sh b/build-all-eve.sh similarity index 95% rename from build-all.sh rename to build-all-eve.sh index d0bc4cb..2f869f7 100755 --- a/build-all.sh +++ b/build-all-eve.sh @@ -15,9 +15,9 @@ SSH_KEY_CONTENT="$(cat "$SSH_KEY")" echo "Using SSH public key: ${SSH_KEY}" IMAGES=( - "eci-base:Dockerfile.base:." - "caterpillar:caterpillar/Dockerfile:." - "cyclictest:cyclictest/Dockerfile:." + "eci-base:Dockerfile.base.eve:." + "caterpillar:caterpillar/Dockerfile.eve:." + "cyclictest:cyclictest/Dockerfile.eve:." ) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/caterpillar/Dockerfile b/caterpillar/Dockerfile index 3d1eaff..4696add 100644 --- a/caterpillar/Dockerfile +++ b/caterpillar/Dockerfile @@ -13,34 +13,17 @@ # License. # -ARG BASE_TAG=latest -FROM eci-base:${BASE_TAG} - -# Install caterpillar benchmark +FROM eci-base:latest +# Install required dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ - caterpillar \ - && rm -rf /var/lib/apt/lists/* + caterpillar \ + && rm -rf /var/lib/apt/lists/* ENV USER=root RUN chmod +x /opt/benchmarking/caterpillar/caterpillar -# Install uv package manager -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -ENV PATH="/root/.local/bin:$PATH" - -WORKDIR /app - -# Copy project files -COPY pyproject.toml uv.lock* ./ -COPY src/ ./src/ -COPY conf/ ./conf/ -COPY notebook/ ./notebook/ -COPY main.py ./ - -# Install Python 3.12 and sync dependencies -RUN uv python install 3.12 && uv sync +WORKDIR /opt/benchmarking/caterpillar -# Output directory for results +# example directory to share file (-v output:/tmp/output) RUN mkdir -p /tmp/output -ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "demo.demo_mode=false", "run.command=caterpillar"] diff --git a/caterpillar/Dockerfile.eve b/caterpillar/Dockerfile.eve new file mode 100644 index 0000000..3d1eaff --- 
/dev/null +++ b/caterpillar/Dockerfile.eve @@ -0,0 +1,46 @@ +# INTEL CONFIDENTIAL +# +# Copyright (c) Intel Corporation, 2025 +# +# This software and the related documents are Intel copyrighted materials, and +# your use of them is governed by the express license under which they were +# provided to you (License). Unless the License provides otherwise, you may +# not use, modify, copy, publish, distribute, disclose or transmit this +# software or the related documents without Intel's prior written permission. +# +# This software and the related documents are provided as is, with no express +# or implied warranties, other than those that are expressly stated in the +# License. +# + +ARG BASE_TAG=latest +FROM eci-base:${BASE_TAG} + +# Install caterpillar benchmark +RUN apt-get update && apt-get install -y --no-install-recommends \ + caterpillar \ + && rm -rf /var/lib/apt/lists/* + +ENV USER=root +RUN chmod +x /opt/benchmarking/caterpillar/caterpillar + +# Install uv package manager +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +WORKDIR /app + +# Copy project files +COPY pyproject.toml uv.lock* ./ +COPY src/ ./src/ +COPY conf/ ./conf/ +COPY notebook/ ./notebook/ +COPY main.py ./ + +# Install Python 3.12 and sync dependencies +RUN uv python install 3.12 && uv sync + +# Output directory for results +RUN mkdir -p /tmp/output + +ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "demo.demo_mode=false", "run.command=caterpillar"] diff --git a/cyclictest/Dockerfile b/cyclictest/Dockerfile index afc5b49..e8ebd10 100644 --- a/cyclictest/Dockerfile +++ b/cyclictest/Dockerfile @@ -13,33 +13,12 @@ # License. 
# -ARG BASE_TAG=latest -FROM eci-base:${BASE_TAG} - -# Install cyclictest benchmark +FROM eci-base:latest +# Install required dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ - rt-tests \ - && rm -rf /var/lib/apt/lists/* - + rt-tests\ + && rm -rf /var/lib/apt/lists/* ENV USER=root - -# Install uv package manager -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -ENV PATH="/root/.local/bin:$PATH" - -WORKDIR /app - -# Copy project files -COPY pyproject.toml uv.lock* ./ -COPY src/ ./src/ -COPY conf/ ./conf/ -COPY notebook/ ./notebook/ -COPY main.py ./ - -# Install Python 3.12 and sync dependencies -RUN uv python install 3.12 && uv sync - -# Output directory for results +# example directory to share file (-v output:/tmp/output) RUN mkdir -p /tmp/output -ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "demo.demo_mode=false", "run.command=cyclictest"] diff --git a/cyclictest/Dockerfile.eve b/cyclictest/Dockerfile.eve new file mode 100644 index 0000000..afc5b49 --- /dev/null +++ b/cyclictest/Dockerfile.eve @@ -0,0 +1,45 @@ +# INTEL CONFIDENTIAL +# +# Copyright (c) Intel Corporation, 2025 +# +# This software and the related documents are Intel copyrighted materials, and +# your use of them is governed by the express license under which they were +# provided to you (License). Unless the License provides otherwise, you may +# not use, modify, copy, publish, distribute, disclose or transmit this +# software or the related documents without Intel's prior written permission. +# +# This software and the related documents are provided as is, with no express +# or implied warranties, other than those that are expressly stated in the +# License. 
+# + +ARG BASE_TAG=latest +FROM eci-base:${BASE_TAG} + +# Install cyclictest benchmark +RUN apt-get update && apt-get install -y --no-install-recommends \ + rt-tests \ + && rm -rf /var/lib/apt/lists/* + +ENV USER=root + +# Install uv package manager +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +WORKDIR /app + +# Copy project files +COPY pyproject.toml uv.lock* ./ +COPY src/ ./src/ +COPY conf/ ./conf/ +COPY notebook/ ./notebook/ +COPY main.py ./ + +# Install Python 3.12 and sync dependencies +RUN uv python install 3.12 && uv sync + +# Output directory for results +RUN mkdir -p /tmp/output + +ENTRYPOINT ["/entrypoint.sh", "uv", "run", "python", "main.py", "run.docker=false", "pqos.enable=false", "run.stressor=false", "run.interactive=false", "demo.demo_mode=false", "run.command=cyclictest"]