# Profiling of `maze_dataset` serializing/loading/saving/reading

In [None]:
import itertools
from typing import Callable, Any
import copy
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from muutils.statcounter import StatCounter
from muutils.timeit_fancy import timeit_fancy, FancyTimeitResult

from maze_dataset import (
    MazeDataset,
    MazeDatasetConfig,
    set_serialize_minimal_threshold,
)
from maze_dataset.generation.generators import GENERATORS_MAP

## Generate Datasets


In [None]:
# One config per (grid_n, n_mazes) pair, each using the generator registered
# under "gen_dfs" in GENERATORS_MAP.
# Alternative `n_mazes` sweeps that can replace the logspace call below:
#   np.logspace(1, 3, 5, dtype=int).tolist()  # 1k
#   np.logspace(0, 4, 9, dtype=int).tolist()  # 10k, notebook results from this set
cfgs: list[MazeDatasetConfig] = [
    MazeDatasetConfig(
        name="test",
        grid_n=grid_n,
        n_mazes=n_mazes,
        maze_ctor=GENERATORS_MAP["gen_dfs"],
    )
    for grid_n in [10]
    # evaluates to [10, 100]: small enough for CI tests
    for n_mazes in np.logspace(1, 2, 2, dtype=int).tolist()
]

# Build one dataset per config.
# NOTE(review): `load_local=False` presumably skips any locally cached copy so
# the dataset is generated fresh -- confirm against `MazeDataset.from_config`.
datasets: list[MazeDataset] = [
    MazeDataset.from_config(config, load_local=False) for config in cfgs
]

## Profile

In [None]:
# Headline column names: the two dataset-size columns followed by a
# default/minimal pair for each measured operation.
# NOTE(review): `columns` is not referenced by the other visible cells.
columns: list[str] = ["grid_n", "n_mazes"] + [
    f"{operation}{variant}"
    for operation in ("serialize", "load", "save", "read")
    for variant in ("", "_minimal")
]

# One dict of measurements per profiled dataset; filled by the profiling loop.
speeds_data: list[dict] = []

In [None]:
def wrapped_timeit_fancy(
    name: str,
    function: Callable,
    do_profiling: bool,
    repeats: int,
    timing_stat: Callable[[StatCounter], float],
) -> tuple[dict, Any]:
    """Time `function` via `timeit_fancy` and package the results under `name`-based keys.

    Args:
        name: Key prefix for the entries of the returned dict.
        function: Zero-argument callable to time.
        do_profiling: Forwarded to `timeit_fancy`; when true, the profile is
            also stored in the returned dict.
        repeats: Forwarded to `timeit_fancy`.
        timing_stat: Reduces the collected timings to the single number stored
            under `name`.

    Returns:
        A pair `(measurements, return_value)`. `measurements` maps `name` to
        the reduced timing, `"<name>:stats"` to the raw timings object and,
        only when `do_profiling` is true, `"<name>:profiling"` to the profile.
        `return_value` is the `return_value` attribute of the `timeit_fancy`
        result.
    """
    timed: FancyTimeitResult = timeit_fancy(
        function,
        get_return=True,
        do_profiling=do_profiling,
        repeats=repeats,
    )

    measurements: dict = {
        name: timing_stat(timed.timings),
        f"{name}:stats": timed.timings,
    }
    if do_profiling:
        measurements[f"{name}:profiling"] = timed.profile

    return measurements, timed.return_value


def measure_dataset_speed(
    d: MazeDataset,
    do_profiling: bool = True,
    repeats: int = 1,
    timing_stat: Callable[[StatCounter], float] = StatCounter.min,
) -> dict:
    """Time serializing, loading, saving and reading of one dataset.

    Every measurement goes through `wrapped_timeit_fancy`, so each measured
    name `X` contributes the keys `X`, `X:stats` and (when `do_profiling` is
    true) `X:profiling` to the returned row. The measured names, in order, are:
    `serialize_full`, `serialize_minimal`, `serialize_minimal_soln_cat`,
    `load_legacy`, `load_full`, `load_minimal`, `load_minimal_soln_cat`,
    `save`, `read_legacy`, `read`, `save_minimal`, `read_minimal`.

    Side effects:
        - Writes `../data/<fname>.zanj` and `../data/<fname>_min.zanj`, where
          `<fname>` is `d.cfg.to_fname()`.
        - Changes the global serialize-minimal threshold during the save/read
          measurements; it is always set back to `None` before returning,
          including when a measurement raises.

    Args:
        d: Dataset to profile. It is never used directly for serialization or
            saving; each such measurement gets its own fresh deep copy.
        do_profiling: Forwarded to `wrapped_timeit_fancy`.
        repeats: Forwarded to `wrapped_timeit_fancy`; values above 1 trigger a
            warning because the results may then be inaccurate.
        timing_stat: Reduces the collected timings to one number per
            measurement.

    Returns:
        A dict with `grid_n`, `n_mazes` and the measurement keys listed above.
    """
    if repeats > 1:
        warnings.warn(
            "Repeats > 1, results might not be accurate due to generation metadata being collected."
        )

    # set up row data
    row_data: dict = dict(
        grid_n=d.cfg.grid_n,
        n_mazes=d.cfg.n_mazes,
    )

    def _timed(name: str, function: Callable) -> Any:
        """Time `function`, merge its measurements into `row_data`, return its result."""
        info, return_value = wrapped_timeit_fancy(
            name,
            function,
            do_profiling=do_profiling,
            repeats=repeats,
            timing_stat=timing_stat,
        )
        row_data.update(info)
        return return_value

    def _fresh_copy() -> MazeDataset:
        """Return an untouched deep copy of `d`, made outside the timed region."""
        return copy.deepcopy(d)

    try:
        set_serialize_minimal_threshold(None)

        # serialization: every variant runs on its own fresh copy
        result_serialize = _timed("serialize_full", _fresh_copy()._serialize_full)
        result_serialize_min = _timed(
            "serialize_minimal", _fresh_copy()._serialize_minimal
        )
        result_serialize_cat = _timed(
            "serialize_minimal_soln_cat", _fresh_copy()._serialize_minimal_soln_cat
        )

        # loading: each loader is timed exactly once on the matching serialized form
        _timed("load_legacy", lambda: MazeDataset._load_legacy(result_serialize))
        _timed("load_full", lambda: MazeDataset._load_full(result_serialize))
        _timed("load_minimal", lambda: MazeDataset._load_minimal(result_serialize_min))
        _timed(
            "load_minimal_soln_cat",
            lambda: MazeDataset._load_minimal_soln_cat(result_serialize_cat),
        )

        # saving and reading
        path_default: str = f"../data/{d.cfg.to_fname()}.zanj"
        path_min: str = f"../data/{d.cfg.to_fname()}_min.zanj"

        # default save
        set_serialize_minimal_threshold(None)
        dataset_default: MazeDataset = _fresh_copy()
        _timed("save", lambda: dataset_default.save(file_path=path_default))

        # read_legacy: same file, read back with the threshold set to -1
        set_serialize_minimal_threshold(-1)
        _timed("read_legacy", lambda: MazeDataset.read(file_path=path_default))

        # default read
        set_serialize_minimal_threshold(None)
        _timed("read", lambda: MazeDataset.read(file_path=path_default))

        # minimal save/read, with the threshold set to 0
        set_serialize_minimal_threshold(0)
        dataset_minimal: MazeDataset = _fresh_copy()
        _timed("save_minimal", lambda: dataset_minimal.save(file_path=path_min))
        _timed("read_minimal", lambda: MazeDataset.read(file_path=path_min))

        # TODO: assert that the datasets read back from disk compare equal to `d`
    finally:
        # never leave the global threshold modified, even if a measurement failed
        set_serialize_minimal_threshold(None)

    return row_data

## Run Profiling

In [None]:
# Profile each dataset in turn, appending one result row per dataset and
# echoing the scalar (non-":"-suffixed) entries as progress output.
for i, d in enumerate(datasets):
    print(f"Profiling {i + 1}/{len(datasets)}:\t{d.cfg}")
    speeds_data.append(measure_dataset_speed(d))
    result = speeds_data[-1]
    # keys containing ":" hold raw stats / profiling objects; leave them out
    cols_short: str = str(
        dict(entry for entry in result.items() if ":" not in entry[0])
    )
    print(f"\t{cols_short}", f"\t{str(d.cfg)}", sep="\n")

### Results

In [None]:
# Collect the per-dataset measurement dicts into a table (one row per dataset).
SPEEDS: pd.DataFrame = pd.DataFrame(speeds_data)

# Bare expression so the notebook renders the table as the cell output.
SPEEDS

In [None]:
def compute_speedups(speeds: pd.DataFrame) -> pd.DataFrame:
    """Add `<operation>/speedup` columns: default (full) timing divided by minimal timing.

    The columns are written into `speeds` itself (the frame is modified in
    place) and the same object is returned.

    Args:
        speeds: Table containing the columns `serialize_full`,
            `serialize_minimal`, `load_full`, `load_minimal`, `save`,
            `save_minimal`, `read` and `read_minimal`.

    Returns:
        `speeds`, with the four speedup columns added.
    """
    # (new column, numerator column, denominator column)
    ratio_specs: tuple[tuple[str, str, str], ...] = (
        ("serialize/speedup", "serialize_full", "serialize_minimal"),
        ("load/speedup", "load_full", "load_minimal"),
        ("save/speedup", "save", "save_minimal"),
        ("read/speedup", "read", "read_minimal"),
    )
    for target, numerator, denominator in ratio_specs:
        speeds[target] = speeds[numerator] / speeds[denominator]

    return speeds


# Add the speedup columns; `compute_speedups` modifies and returns the same frame.
SPEEDS = compute_speedups(SPEEDS)

In [None]:
# Rebuild the table from the raw rows; speedup columns added to the previous
# frame are not carried over, since `speeds_data` holds only the measurements.
SPEEDS: pd.DataFrame = pd.DataFrame(speeds_data)

# SPEEDS.loc[:,"load_legacy":"load_minimal_soln_cat:profiling"]
# Label-based column slice (both end labels are included): the legacy and
# default read measurements together with their stats/profiling columns.
SPEEDS.loc[:, "read_legacy":"read:profiling"]

In [None]:
# List every column name currently in the table (displayed as the cell output).
SPEEDS.columns

In [None]:
def compute_speedups(speeds: pd.DataFrame) -> pd.DataFrame:
    """Add speedup columns, mostly measured against the legacy load/read timings.

    Each new column is the elementwise ratio of two existing timing columns:

    - `serialize/speedup`    = `serialize_full` / `serialize_minimal`
    - `load_minimal/speedup` = `load_legacy` / `load_minimal`
    - `load/speedup`         = `load_legacy` / `load_full`
    - `save/speedup`         = `save` / `save_minimal`
    - `read_minimal/speedup` = `read_legacy` / `read_minimal`
    - `read/speedup`         = `read_legacy` / `read`

    The columns are written into `speeds` itself (the frame is modified in
    place) and the same object is returned.

    Args:
        speeds: Table containing all timing columns named above.

    Returns:
        `speeds`, with the six speedup columns added.
    """
    # (new column, numerator column, denominator column)
    ratio_specs: tuple[tuple[str, str, str], ...] = (
        ("serialize/speedup", "serialize_full", "serialize_minimal"),
        ("load_minimal/speedup", "load_legacy", "load_minimal"),
        ("load/speedup", "load_legacy", "load_full"),
        ("save/speedup", "save", "save_minimal"),
        ("read_minimal/speedup", "read_legacy", "read_minimal"),
        ("read/speedup", "read_legacy", "read"),
    )
    for target, numerator, denominator in ratio_specs:
        speeds[target] = speeds[numerator] / speeds[denominator]

    return speeds


# Add the speedup columns; `compute_speedups` modifies and returns the same frame.
SPEEDS = compute_speedups(SPEEDS)

In [None]:
# Show only the scalar columns: names containing ":" (the ":stats" and
# ":profiling" entries) hold raw objects rather than single numbers.
SPEEDS[[c for c in SPEEDS.columns if ":" not in c]]

In [None]:
def plot_speeds(
    speeds: pd.DataFrame,
    column_measurement_prefixes: list[str] = ["serialize", "load", "save", "read"],
) -> None:
    """Plot raw timings (top row) and speedups (bottom row), one column of axes per prefix.

    For each prefix, the top axes show every column whose name contains the
    prefix, excluding names that contain "speedup" or ":". The bottom axes
    show the `"<prefix>/speedup"` column and, for prefixes other than
    "serialize" and "save", the `"<prefix>_minimal/speedup"` column as well.
    Each distinct speedup column is drawn once per `grid_n` and the legend
    names the column, so the curves can be told apart. All axes are log-log
    with the number of mazes on the x axis.

    Args:
        speeds: Table with `grid_n`, `n_mazes`, the timing columns and the
            speedup columns described above. A missing speedup column raises
            `KeyError`.
        column_measurement_prefixes: Measurement name prefixes to plot. The
            default list is only read, never mutated. An empty sequence plots
            nothing.
    """
    n_measurements: int = len(column_measurement_prefixes)
    if n_measurements == 0:
        # nothing to plot, so do not create a figure at all
        return

    # `squeeze=False` keeps `axs` two-dimensional even for a single prefix, so
    # the `axs[row, i]` indexing below is always valid.
    _fig, axs = plt.subplots(
        2, n_measurements, figsize=(n_measurements * 5, 10), squeeze=False
    )

    unique_grid_ns: list[int] = speeds["grid_n"].unique().tolist()

    for i, prefix in enumerate(column_measurement_prefixes):
        print(f"Plotting {prefix} timings and speedups")
        ax_timings = axs[0, i]
        ax_speedups = axs[1, i]

        # raw timing columns for this prefix (scalar columns only)
        timing_cols: list[str] = [
            col
            for col in speeds.columns
            if (prefix in col) and ("speedup" not in col) and (":" not in col)
        ]

        # speedup columns for this prefix; for "serialize" and "save" both
        # names coincide, so de-duplicate while keeping the order
        col_name: str = (
            f"{prefix}" if prefix in ("serialize", "save") else f"{prefix}_minimal"
        )
        speedup_cols: list[str] = list(
            dict.fromkeys([f"{col_name}/speedup", f"{prefix}/speedup"])
        )

        for grid_n in unique_grid_ns:
            print(f"Plotting grid_n={grid_n}")
            speeds_masked = speeds[speeds["grid_n"] == grid_n].sort_values("n_mazes")
            x_n_mazes = speeds_masked["n_mazes"]

            # raw timings
            for col in timing_cols:
                ax_timings.plot(
                    x_n_mazes,
                    speeds_masked[col],
                    "x-",
                    label=f"grid_n={grid_n}, {col}",
                )

            # speedups
            for speedup_col in speedup_cols:
                ax_speedups.plot(
                    x_n_mazes,
                    speeds_masked[speedup_col],
                    "x-",
                    label=f"grid_n={grid_n}, {speedup_col}",
                )

        # axes properties do not depend on grid_n, so set them once per prefix
        ax_timings.set(
            xscale="log",
            yscale="log",
            xlabel="Number of mazes",
            ylabel="Runtime [sec]",
            title=f"{prefix} timings",
        )
        ax_speedups.set(
            xscale="log",
            yscale="log",
            xlabel="Number of mazes",
            ylabel="Speedup",
            title=f"{prefix} speedups",
        )
        if unique_grid_ns:
            ax_timings.legend()
            ax_speedups.legend()


# Draw the timing and speedup plots for all default measurement prefixes.
plot_speeds(SPEEDS)

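Speedups plotted on the bottom row of axes are relative to the legacy performance: the `_minimal` variants are compared against legacy, and for `load` and `read` the default variants are plotted as well. `serialize_full` and `save` are unchanged from the legacy version, so the `serialize` and `save` speedups are computed relative to those timings.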

In [None]:
# Show the profiling object recorded for `serialize_minimal` next to the dataset size.
SPEEDS[["grid_n", "n_mazes", "serialize_minimal:profiling"]]

In [None]:
# Print the `load_minimal` profile of the last row, sorted by "tottime".
# The `[len(SPEEDS) - 1]` lookup is by index label, which is the last row for
# the default integer index built by `pd.DataFrame(speeds_data)`.
# NOTE(review): the stored profile presumably behaves like `pstats.Stats`
# (it is used via `sort_stats` / `print_stats`) -- confirm against muutils.
SPEEDS["load_minimal:profiling"][len(SPEEDS) - 1].sort_stats("tottime").print_stats()