# Partitioning workload benchmarks

This notebook regenerates the deterministic partitioning scenarios used in the paper. It executes each workload sweep under the shared memory budget, persists the raw CSV/JSON tables in `benchmarks/results/`, and creates the publication summaries and figures inline.

The workflow relies on the benchmark helpers introduced alongside the partitioning experiments and proceeds in four steps:

1. Resolve the repository root and add `benchmarks/` to `sys.path`.
2. Import the scenario registry plus the runner utilities.
3. Execute each scenario with a single repetition and a 256 MiB memory budget, mirroring the CLI invocations documented in `benchmarks/README.md`.
4. Write the raw results (`.csv`/`.json`) and summary tables, then render a runtime comparison figure for each scenario for the paper.

In [None]:
from __future__ import annotations

import sys
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Markdown, display

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* holding ``pyproject.toml``.

    ``start`` itself is checked first, followed by each of its ancestors from
    closest to farthest, so the innermost match wins.

    Raises:
        RuntimeError: If none of the candidates contains ``pyproject.toml``.
    """
    candidates = (start, *start.parents)
    root = next(
        (
            folder
            for folder in candidates
            if folder.joinpath('pyproject.toml').exists()
        ),
        None,
    )
    if root is None:
        raise RuntimeError('unable to locate project root')
    return root

# Locate the project checkout starting from the notebook's working directory.
REPO_ROOT = find_repo_root(Path.cwd())
BENCHMARK_ROOT = REPO_ROOT.joinpath('benchmarks')

# Put the benchmarks directory at the front of the import path (once only) so
# the helper modules imported below resolve.
_benchmark_entry = str(BENCHMARK_ROOT)
if _benchmark_entry not in sys.path:
    sys.path.insert(0, _benchmark_entry)

from partitioning_workloads import SCENARIOS, iter_scenario
from run_benchmark import (
    run_scenarios,
    save_results,
    write_partitioning_tables,
    summarise_partitioning,
)
from plot_utils import setup_benchmark_style, plot_quasar_vs_baseline_best


In [None]:
# Directory that receives the benchmark output; created if it is missing.
RESULTS_DIR = REPO_ROOT.joinpath('benchmarks', 'results')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Shared run parameters for every scenario.
MEMORY_BUDGET = 256 * 2**20  # bytes, i.e. 256 MiB
REPETITIONS = 1

# Apply the shared plotting style before any figure is created.
setup_benchmark_style(context='paper', font_scale=1.0)


In [None]:
# Per-scenario outputs, keyed by scenario name.
scenario_results: dict[str, pd.DataFrame] = {}
scenario_summaries: dict[str, pd.DataFrame] = {}

# Walk the registered scenarios in sorted-name order.
for name in sorted(SCENARIOS):
    # Human-readable heading, reused for the section title and the plot title.
    pretty_name = name.replace('_', ' ').title()
    display(Markdown(f"## Scenario: {pretty_name}"))

    # Run the scenario with the shared repetition count and memory budget.
    instances = iter_scenario(name)
    df = run_scenarios(instances, REPETITIONS, memory_bytes=MEMORY_BUDGET)

    # Write the raw results and the partitioning tables under RESULTS_DIR,
    # using the scenario name as the base path.
    base_path = RESULTS_DIR / name
    save_results(df, base_path)
    write_partitioning_tables(df, base_path)

    summary = summarise_partitioning(df)
    scenario_results[name], scenario_summaries[name] = df, summary

    # Show the raw table first, then its summary, each under its own heading.
    for caption, table in (
        ('### Raw results', df),
        ('### Summary table', summary),
    ):
        display(Markdown(caption))
        display(table)

    # Plot only the rows whose 'scenario' column matches this scenario.
    fig, ax = plt.subplots(figsize=(6.5, 4.0))
    scenario_rows = df.loc[df['scenario'] == name]
    plot_quasar_vs_baseline_best(
        scenario_rows,
        metric='run_time_mean',
        annotate_backend=True,
        ax=ax,
        log_scale=False,
    )
    ax.set_title(f"{pretty_name} runtime comparison")
    ax.set_ylabel('Runtime (s)')
    plt.show()


In [None]:
# Stack the per-scenario summary tables into one frame; ignore_index=True
# discards the individual indexes and renumbers the rows.
summary_frames = list(scenario_summaries.values())
combined_summary = pd.concat(summary_frames, ignore_index=True)

display(Markdown('## Combined summary'))
display(combined_summary)
