In [1]:
!make docker-image > /dev/null 2>&1

In [2]:
!CONTAINER_CMD="bash -lc 'make install-ycsb" make docker > /dev/null 2>&1

In [3]:
from pathlib import Path
import pexpect
import os
import time

""" Collector class has helper methods to interact with kermit"""
class Collector: 
    def __init__(self, config: Path):
        self.env = os.environ.copy()
        self.env["INTERACTIVE"] = "it"
        self.env["CONTAINER_CMD"] = f"bash -lc 'KERNMLOPS_CONFIG_FILE={config} make collect-data'"
        self.collect_process : pexpect.spawn | None = None

    def start_collection(self, logfile=None):
        self.collect_process = pexpect.spawn("make docker", env=self.env, timeout=None, logfile=logfile)
        self.collect_process.expect_exact(["Started benchmark"])

    def _after_run_generate_file_data() -> dict[str, list[Path]]:
        start_path : Path = Path("./data")
        list_of_collect_id_dirs = start_path.glob("*/*/*")
        latest_collect_id = max(list_of_collect_id_dirs, key=os.path.getctime)
        list_of_files = latest_collect_id.glob("*.*.parquet")
        output = {}
        for f in list_of_files:
            index = str(f).removeprefix(str(f.parent) + "/").split(".")[0]
            if index not in output.keys():
                output[index] = []
            output[index].append(f)
        return output
        
    def wait(self) -> int:
        if self.collect_process is None:
            return
        self.collect_process.expect([pexpect.EOF])
        self.collect_process.wait()
        return Collector._after_run_generate_file_data()
        
    def stop_collection(self):
        if self.collect_process is None:
            return
        self.collect_process.sendline("END")
        return self.wait()

There are two ways to run kermit:
- With the raw config
- With a pre-programmed benchmark config

In [4]:
# New Page Fault Collector
collect = Collector("./config/raw_overrides.yaml")
log = open("page_fault_log.txt", "bw")
    
collect.start_collection(logfile=log)

# Run a program that causes exactly 1 page fault in user space
!./page_fault

data = collect.stop_collection()
log.close()

# Check what was collected
# print("Available keys:", data.keys())
# print(data)

# Read the log to see if there were errors
# with open("page_fault_log.txt", "r") as f:
#    print("Log contents:", f.read())

# Analyze results
import polars as pl
df = pl.read_parquet(data["page_fault"])
# print(df.head())
print(f"Total faults: {len(df)}")
print(f"Major faults: {df.filter(pl.col('is_major')).height}")
# Check fault patterns
df_filtered = df.filter(
    (pl.col('comm') == 'page_fault') & 
    (pl.col('is_major') == True)
)
print(f"Major faults for page_fault app: {len(df_filtered)}")

major_summary = df.filter(pl.col('is_major')).group_by('comm').len()
print(major_summary)

Process PID: 51123
----------------------------
Initial - Major (hard) page faults: 1, Minor (soft) page faults: 609
Cannot drop caches (not root). Hard page fault unlikely.
Before access - Major (hard) page faults: 1, Minor (soft) page faults: 611
After access - Major (hard) page faults: 2, Minor (soft) page faults: 611

Read value: A

For best results ensuring a hard page fault:
1. Run: ./page_fault create
2. Run: echo 3 | sudo tee /proc/sys/vm/drop_caches
3. Run: ./page_fault access
Total faults: 14109
Major faults: 126
Major faults for page_fault app: 7
shape: (10, 2)
┌─────────────┬─────┐
│ comm        ┆ len │
│ ---         ┆ --- │
│ str         ┆ u32 │
╞═════════════╪═════╡
│ cat         ┆ 28  │
│ sed         ┆ 20  │
│ ps          ┆ 19  │
│ python      ┆ 1   │
│ bash        ┆ 8   │
│ sleep       ┆ 7   │
│ cpuUsage.sh ┆ 8   │
│ sh          ┆ 21  │
│ page_fault  ┆ 7   │
│ which       ┆ 7   │
└─────────────┴─────┘


In [5]:
collect = Collector("./config/redis_never.yaml")
collect.start_collection(None)
data = collect.stop_collection()

In [6]:
# Analyze TCP branches
import polars as pl
df = pl.read_parquet(data["page_fault"])

print(df.group_by("comm").count().sort("count", descending=True))

print(f"Total faults: {len(df)}")
print(f"Major faults: {df.filter(pl.col('is_major')).height}")
# Check fault patterns
df_filtered = df.filter(
    (pl.col('comm') == 'redis-server') & 
    (pl.col('is_major') == True)
)
print(f"Major faults for redis-server app: {len(df_filtered)}")

major_summary = df.filter(pl.col('is_major')).group_by('comm').count().sort("count", descending=True)
print(major_summary)

shape: (67, 2)
┌─────────────────┬────────┐
│ comm            ┆ count  │
│ ---             ┆ ---    │
│ str             ┆ u32    │
╞═════════════════╪════════╡
│ python          ┆ 162614 │
│ C2 CompilerThre ┆ 113542 │
│ java            ┆ 105448 │
│ C1 CompilerThre ┆ 20956  │
│ Thread-2        ┆ 13635  │
│ …               ┆ …      │
│ dockerd         ┆ 8      │
│ VM Periodic Tas ┆ 6      │
│ docker          ┆ 5      │
│ Service Thread  ┆ 5      │
│ jemalloc_bg_thd ┆ 4      │
└─────────────────┴────────┘
Total faults: 480784
Major faults: 10248
Major faults for redis-server app: 68
shape: (22, 2)
┌─────────────────┬───────┐
│ comm            ┆ count │
│ ---             ┆ ---   │
│ str             ┆ u32   │
╞═════════════════╪═══════╡
│ java            ┆ 8755  │
│ cat             ┆ 266   │
│ sh              ┆ 213   │
│ sed             ┆ 190   │
│ ps              ┆ 182   │
│ …               ┆ …     │
│ uname           ┆ 12    │
│ tmcc.bin        ┆ 7     │
│ C2 CompilerThre ┆ 3     │
│ Thre

  print(df.group_by("comm").count().sort("count", descending=True))
  major_summary = df.filter(pl.col('is_major')).group_by('comm').count().sort("count", descending=True)


In [7]:
# Create collector with XSBench configuration
collect = Collector("./config/xsbench.yaml")

# Start collection and run XSBench
print("Starting collection with XSBench workload...")
collect.start_collection(None)

# Wait for XSBench to complete
print("Running XSBench benchmark...")
data = collect.stop_collection()

Starting collection with XSBench workload...
Running XSBench benchmark...


In [8]:
df = pl.read_parquet(data["page_fault"])

print(df.group_by("comm").count().sort("count", descending=True))

print(f"Total faults: {len(df)}")
print(f"Major faults: {df.filter(pl.col('is_major')).height}")
# Check fault patterns
df_filtered = df.filter(
    (pl.col('comm') == 'XSBench') & 
    (pl.col('is_major') == True)
)
print(f"Major faults for XSBench app: {len(df_filtered)}")

major_summary = df.filter(pl.col('is_major')).group_by('comm').count().sort("count", descending=True)
print(major_summary)

shape: (15, 2)
┌─────────────────┬───────┐
│ comm            ┆ count │
│ ---             ┆ ---   │
│ str             ┆ u32   │
╞═════════════════╪═══════╡
│ python          ┆ 14589 │
│ XSBench         ┆ 1211  │
│ cpuUsage.sh     ┆ 991   │
│ node            ┆ 774   │
│ ps              ┆ 409   │
│ …               ┆ …     │
│ systemd-journal ┆ 78    │
│ sleep           ┆ 76    │
│ which           ┆ 70    │
│ jupyter-noteboo ┆ 4     │
│ systemd         ┆ 4     │
└─────────────────┴───────┘
Total faults: 19283
Major faults: 135
Major faults for XSBench app: 7
shape: (10, 2)
┌─────────────┬───────┐
│ comm        ┆ count │
│ ---         ┆ ---   │
│ str         ┆ u32   │
╞═════════════╪═══════╡
│ cat         ┆ 28    │
│ sh          ┆ 21    │
│ sed         ┆ 20    │
│ sshd        ┆ 19    │
│ ps          ┆ 17    │
│ cpuUsage.sh ┆ 8     │
│ which       ┆ 7     │
│ XSBench     ┆ 7     │
│ sleep       ┆ 7     │
│ python      ┆ 1     │
└─────────────┴───────┘


  print(df.group_by("comm").count().sort("count", descending=True))
  major_summary = df.filter(pl.col('is_major')).group_by('comm').count().sort("count", descending=True)
