# Profile of `maze_dataset` Dumping and Loading

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytest

from maze_dataset.dataset.maze_dataset import (
    MazeDataset,
    MazeDatasetConfig,
)
from maze_dataset.generation.generators import GENERATORS_MAP

from maze_dataset.utils import timeit_fancy

## Generate Datasets


In [2]:
# Small configurations profiled on every run. All of them use the DFS generator and
# the name "test"; only the values in the spec tuples below vary.
# Spec layout: (grid_n, n_mazes, maze_ctor_kwargs, serialize_minimal_threshold)
cfgs: list[MazeDatasetConfig] = [
    MazeDatasetConfig(
        name="test",
        grid_n=grid_n,
        n_mazes=n_mazes,
        maze_ctor=GENERATORS_MAP['gen_dfs'],
        maze_ctor_kwargs=ctor_kwargs,
        serialize_minimal_threshold=threshold,
    )
    for grid_n, n_mazes, ctor_kwargs, threshold in [
        (3, 1, {}, 1),
        (5, 10, {'do_forks': False}, None),
        (10, 100, {}, None),
        (5, 10, {}, None),
        (5, 100, {}, None),
        (5, 1000, {}, None),
    ]
]

# One generated dataset per configuration, in the same order as `cfgs`.
datasets: list[MazeDataset] = list(map(MazeDataset.generate, cfgs))
# Number of small configs; the large-dataset section uses it to skip these entries.
old_len_cfgs: int = len(cfgs)

## Profile

In [3]:
# Column order of the results table: dataset parameters first, then each measurement
# immediately followed by its `_minimal` counterpart.
columns: list[str] = [
    'grid_n', 'n_mazes',
    'serialize', 'serialize_minimal',
    'load', 'load_minimal',
    'save', 'save_minimal',
    'read', 'read_minimal',
]
# Accumulates one row (dict) per profiled dataset.
speeds_data: list[dict] = []


In [4]:
def measure_dataset_speed(d: MazeDataset) -> dict:
    """Time serialization, loading, saving and reading of one dataset.

    Each operation is timed twice via `timeit_fancy`: once in the default mode and
    once in the "minimal" mode. For the in-memory step the two modes are
    `d.serialize` and `d.serialize_minimal`; for the on-disk step the mode is
    selected by setting `d.cfg.serialize_minimal_threshold` to `None` (default) or
    `0` (minimal) before calling `d.save`.

    Side effects: writes `../data/<fname>.zanj` and `../data/<fname>_min.zanj`,
    where `<fname>` is `d.cfg.to_fname()`. `d.cfg.serialize_minimal_threshold` is
    changed temporarily and restored to the caller's value on exit, including when
    one of the timed operations raises.

    # Parameters:
     - `d : MazeDataset` dataset to profile

    # Returns:
     - `dict` with keys `grid_n`, `n_mazes`, `serialize`, `serialize_minimal`,
       `load`, `load_minimal`, `save`, `save_minimal`, `read`, `read_minimal`.
       The measurement values are whatever `timeit_fancy` reports (the notebook
       treats them as runtimes in seconds).
    """
    # set up row data
    row_data: dict = dict(
        grid_n=d.cfg.grid_n,
        n_mazes=d.cfg.n_mazes,
    )

    # remember the caller's setting so profiling does not permanently alter the config
    original_threshold = d.cfg.serialize_minimal_threshold
    try:
        # serialization & loading (in memory)
        d.cfg.serialize_minimal_threshold = None
        row_data['serialize'], ser_default = timeit_fancy(d.serialize, get_return=True)
        row_data['serialize_minimal'], ser_min = timeit_fancy(d.serialize_minimal, get_return=True)
        row_data['load'] = timeit_fancy(lambda: MazeDataset.load(ser_default))
        row_data['load_minimal'] = timeit_fancy(lambda: MazeDataset.load(ser_min))

        # saving and reading (on disk); file names are derived while the threshold is None
        path_default: str = f'../data/{d.cfg.to_fname()}.zanj'
        path_min: str = f'../data/{d.cfg.to_fname()}_min.zanj'

        # default
        d.cfg.serialize_minimal_threshold = None
        row_data['save'] = timeit_fancy(lambda: d.save(file_path=path_default))
        row_data['read'], read_default = timeit_fancy(lambda: MazeDataset.read(file_path=path_default), get_return=True)
        # minimal
        d.cfg.serialize_minimal_threshold = 0
        row_data['save_minimal'] = timeit_fancy(lambda: d.save(file_path=path_min))
        row_data['read_minimal'], read_minimal = timeit_fancy(lambda: MazeDataset.read(file_path=path_min), get_return=True)

        # round-trip checks are currently disabled; `read_default` / `read_minimal`
        # are kept so they can be re-enabled without changing the timing calls
        # assert d == read_default
        # assert d == read_minimal
    finally:
        # restore the threshold the dataset was configured with
        d.cfg.serialize_minimal_threshold = original_threshold

    return row_data

## Profile small datasets only

In [5]:
# Start from an empty list so that re-running this cell replaces the previous
# measurements instead of appending duplicate rows to the results table.
speeds_data.clear()
for d in datasets:
    print(f'Profiling: {d.cfg}')
    speeds_data.append(measure_dataset_speed(d))

Profiling: MazeDatasetConfig(name='test', seq_len_min=1, seq_len_max=512, seed=42, applied_filters=[], grid_n=3, n_mazes=1, maze_ctor=<function LatticeMazeGenerators.gen_dfs at 0x0000017A67C6BD80>, maze_ctor_kwargs={}, serialize_minimal_threshold=1)
Profiling: MazeDatasetConfig(name='test', seq_len_min=1, seq_len_max=512, seed=42, applied_filters=[], grid_n=5, n_mazes=10, maze_ctor=<function LatticeMazeGenerators.gen_dfs at 0x0000017A67C6BD80>, maze_ctor_kwargs={'do_forks': False}, serialize_minimal_threshold=None)
Profiling: MazeDatasetConfig(name='test', seq_len_min=1, seq_len_max=512, seed=42, applied_filters=[], grid_n=10, n_mazes=100, maze_ctor=<function LatticeMazeGenerators.gen_dfs at 0x0000017A67C6BD80>, maze_ctor_kwargs={}, serialize_minimal_threshold=None)
Profiling: MazeDatasetConfig(name='test', seq_len_min=1, seq_len_max=512, seed=42, applied_filters=[], grid_n=5, n_mazes=10, maze_ctor=<function LatticeMazeGenerators.gen_dfs at 0x0000017A67C6BD80>, maze_ctor_kwargs={}, ser

### Results

In [6]:
# Results table: one row per profiled dataset, columns in the order given by `columns`.
SPEEDS: pd.DataFrame = pd.DataFrame(data=speeds_data, columns=columns)

def compute_speedups(speeds: pd.DataFrame, column_measurement_prefixes: list[str] = ['serialize', 'load', 'save', 'read']) -> pd.DataFrame:
    """Add a `<prefix>_speedup` column for every measurement prefix.

    The speedup is the ratio `<prefix> / <prefix>_minimal`, so values above 1 mean
    the minimal variant took less time than the default one.

    # Parameters:
     - `speeds : pd.DataFrame` frame holding `<prefix>` and `<prefix>_minimal` columns
     - `column_measurement_prefixes : list[str]` measurement names to process

    # Returns:
     - `pd.DataFrame` the same `speeds` object, modified in place with the new columns
    """
    for prefix in column_measurement_prefixes:
        default_times = speeds[f'{prefix}']
        minimal_times = speeds[f'{prefix}_minimal']
        speeds[f'{prefix}_speedup'] = default_times / minimal_times
    return speeds

# Append the `<measurement>_speedup` ratio columns to the results table.
SPEEDS = compute_speedups(speeds=SPEEDS)

In [8]:
# Show the results table, including the derived `*_speedup` columns.
SPEEDS

Unnamed: 0,grid_n,n_mazes,serialize,serialize_minimal,load,load_minimal,save,save_minimal,read,read_minimal,serialize_speedup,load_speedup,save_speedup,read_speedup
0,3,1,0.001816,0.004684,0.010739,0.011121,0.021051,0.024423,0.012759,0.014712,0.387763,0.965631,0.861929,0.867272
1,5,10,0.00185,0.005718,0.012284,0.011208,0.026159,0.026598,0.016718,0.015396,0.323517,1.095967,0.983468,1.085884
2,10,100,0.002202,0.014688,0.020319,0.008195,0.247172,0.038152,0.218833,0.011488,0.14994,2.479256,6.478559,19.048197
3,5,10,0.001857,0.005665,0.012256,0.011258,0.028804,0.025777,0.019413,0.015148,0.32774,1.08869,1.117426,1.281563
4,5,100,0.002222,0.014078,0.019727,0.007572,0.099197,0.033793,0.072562,0.010378,0.157862,2.60539,2.935398,6.992113
5,5,1000,0.006371,0.095593,0.143195,0.017081,0.284849,0.125186,0.662439,0.019039,0.066646,8.383326,2.275415,34.794242


In [None]:
# Runtime vs. number of mazes, one panel per operation, default vs. minimal variant.
# Only rows from index 3 onward are plotted (sliced with `.loc[3:]`).
# The results table is named `SPEEDS`; the previous `speeds` name was never defined.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, prefix in zip(axes.flat, ['serialize', 'load', 'save', 'read']):
    SPEEDS.loc[3:, ['n_mazes', prefix, f'{prefix}_minimal']].plot(x='n_mazes', ax=ax, logx=True, logy=True)
    ax.set_ylabel('Runtime [sec]')

## Profile Large Datasets
This section should be skipped during the CI autoruns, since timing large datasets takes a while. `sys.exit` is called so that execution stops and the cell outputs below are preserved. The outputs are kept for reference and should be regenerated manually after further updates to `serialize`, `load`, `save`, etc. To manually rerun with all the datasets, comment out the cell below first, run the notebook, then uncomment it again before merging.

In [None]:
# Deliberate stop: automated runs end here so the large-dataset cells below are not
# executed and their saved outputs are preserved. Comment this cell out to rerun
# the large-dataset profiling manually.
import sys
sys.exit(0)

In [None]:
# Additional, larger configurations; each spec holds the keyword arguments that vary.
cfgs.extend([
    MazeDatasetConfig(name="test", **spec)
    for spec in [
        dict(grid_n=5, n_mazes=10000, maze_ctor=GENERATORS_MAP['gen_dfs'], maze_ctor_kwargs={}, serialize_minimal_threshold=None),
    ]
])
# Generate datasets only for the configs added after the small ones.
new_cfgs = cfgs[old_len_cfgs:]
datasets.extend([MazeDataset.generate(cfg) for cfg in new_cfgs])

In [None]:
# Profile only the datasets added in the large-dataset section and write the timings
# into row `i` of the results table. The table is named `SPEEDS`; the previous
# `speeds` name was never defined. Assigning to a new row label through `.loc`
# enlarges the frame; columns not set here stay NaN for that row.
for i, d in enumerate(datasets):
    if i < old_len_cfgs:
        continue  # No need to rerun small datasets
    print(d.cfg)

    # in-memory serialization & loading
    d.cfg.serialize_minimal_threshold = None
    SPEEDS.loc[i, 'serialize'], s = timeit_fancy(d.serialize, get_return=True)
    SPEEDS.loc[i, 'serialize_minimal'], smin = timeit_fancy(d.serialize_minimal, get_return=True)
    SPEEDS.loc[i, 'load'] = timeit_fancy(lambda: MazeDataset.load(s))
    SPEEDS.loc[i, 'load_minimal'] = timeit_fancy(lambda: MazeDataset.load(smin))

    # on-disk save & read, default format
    p = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', d.cfg.to_fname() + '.zanj'))
    p_min = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', d.cfg.to_fname() + '_min.zanj'))
    SPEEDS.loc[i, 'save'] = timeit_fancy(lambda: d.save(file_path=p))
    SPEEDS.loc[i, 'read'], rt = timeit_fancy(lambda: MazeDataset.read(file_path=p), get_return=True)

    # on-disk save & read, minimal format (selected through the config threshold)
    d.cfg.serialize_minimal_threshold = 1
    SPEEDS.loc[i, 'save_minimal'] = timeit_fancy(lambda: d.save(file_path=p_min))
    SPEEDS.loc[i, 'read_minimal'], rt_min = timeit_fancy(lambda: MazeDataset.read(file_path=p_min), get_return=True)
    d.cfg.serialize_minimal_threshold = None

In [None]:
# Refill the dataset-parameter columns for every row from the configs, so the rows
# added for the large datasets get their `grid_n` / `n_mazes` values.
# The results table is named `SPEEDS`; the previous `speeds` name was never defined.
SPEEDS['grid_n'] = [c.grid_n for c in cfgs]
SPEEDS['n_mazes'] = [c.n_mazes for c in cfgs]

### Results

In [None]:
# Report, for the row with the most mazes, how much faster each minimal variant is.
# The results table is named `SPEEDS`; the previous `speeds` name was never defined.
# Columns are addressed by name rather than by position, so the result does not
# depend on the column order of the table.
n_mazes_values = SPEEDS['n_mazes'].values
max_ind, max_n = np.argmax(n_mazes_values), np.max(n_mazes_values)
print(f'{max_n} mazes:')
largest_row = SPEEDS.iloc[max_ind]
for prefix in ['serialize', 'load', 'save', 'read']:
    advantage = largest_row[prefix] / largest_row[f'{prefix}_minimal']
    print('`{}`: \tminimal/standard speedup: {:3.1f}x'.format(prefix, advantage))

In [None]:
# Show the full results table (the frame is named `SPEEDS`; `speeds` was never defined).
SPEEDS

Comparing rows 2 and 4, it appears that `grid_n` has a relatively small effect on `serialize` and `load` runtimes. Those functions appear to run in $O(n_{\mathrm{mazes}})$ time. `grid_n` does impact `save` and `read`, but has less effect on their `_minimal` counterparts.

To compare the speed of analogous procedures vs `n_mazes`, the plots below show data from `speeds.loc[3:,:]`.

In [None]:
# Runtime vs. number of mazes, one panel per operation, default vs. minimal variant.
# Only rows from index 3 onward are plotted (sliced with `.loc[3:]`).
# The results table is named `SPEEDS`; the previous `speeds` name was never defined.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, prefix in zip(axes.flat, ['serialize', 'load', 'save', 'read']):
    SPEEDS.loc[3:, ['n_mazes', prefix, f'{prefix}_minimal']].plot(x='n_mazes', ax=ax, logx=True, logy=True)
    ax.set_ylabel('Runtime [sec]')