# Data framework: the basic paradigm

user implements one function define_experiment

then runs `run_experiment.py`

it runs potentially many experimental trials (over all defined configurations), captures output, builds a sqlite database, queries it, produces plots, produces html pages to display plots...

also lots of tools to do querying, plot generation and analysis in jupyter notebooks.

# Run the following code cell before any others

It does basic initialization for this notebook.

In [None]:
## make the data framework importable, then pull its helpers into this notebook
import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

print("Initialized.")

# The 'hello world' of `run_experiment.py`

defining a trivial experiment that compiles and runs a single command once and saves the output.

we call `run_in_jupyter` and pass it `define_experiment`. alternatively, we could save `define_experiment` in a python file and run the equivalent `run_experiment.py` command...

In [None]:
def define_experiment(exp_dict, args):
    """Describe the simplest experiment: one compile command and one run command.

    exp_dict is handed to every set_* helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_compile(exp_dict, cwd + '/../../microbench')      ## working dir for compiling
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')      ## working dir for running
    set_cmd_compile(exp_dict, 'make brown_ext_abtree_lf.debra')

    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./brown_ext_abtree_lf.debra -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 1000'
    set_cmd_run(exp_dict, run_cmd)


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

## if the define_experiment() function above were saved in a file myexp.py,
## then the run_in_jupyter call below is equivalent to running shell command:
##   ../../tools/data_framework/run_experiment.py myexp.py --production
run_in_jupyter(define_experiment, cmdline_args='--production')

# Try the same thing from the command line!

copy the define_experiment function definition above into a file called `myexp.py` (in this directory) and then run `../../tools/data_framework/run_experiment.py myexp.py --production` in the shell (in this directory)

# (Re)running results without compiling

introduce rerunning experiments without compiling

In [None]:
def define_experiment(exp_dict, args):
    """Same single-command experiment as before; only the driver flags below differ.

    exp_dict is handed to every set_* helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_compile(exp_dict, cwd + '/../../microbench')      ## working dir for compiling
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')      ## working dir for running
    set_cmd_compile(exp_dict, 'make brown_ext_abtree_lf.debra')

    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./brown_ext_abtree_lf.debra -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 1000'
    set_cmd_run(exp_dict, run_cmd)


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

## equiv cmd: [...]/run_experiment.py myexp.py --production --no-compile
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile')


# Data files (captured stdout/err)

introduce concept of data files (capturing the output of the run)

this is the output of that one run command we did

In [None]:
## print the contents of data/data000001.txt (the captured output of the single run above)
print(shell_to_str('cat data/data000001.txt'))

# Running with varying parameters

of course running one command isn't very interesting...

introduce run params

    ## add parameters that you want your experiments to be run with.
    ## your program will be run once for each set of values in the CROSS PRODUCT of all parameters.
    ## (i.e., we will run your program with every combination of parameters)

introduce replacement strings: {DS_TYPENAME}

note we now need to compile ALL of the binaries we want to run. we just change our make command to compile everything...

    ## you can use any of the run params you define to dynamically replace {_tokens_like_this} in strings. for example, we can include {DS_TYPENAME} in our run command, and it will be replaced by the current value of {DS_TYPENAME} (that's right, we can run different commands based on the current value of DS_TYPENAME)
    ## you can also get the paths to key directories by using:
    ##      {__dir_compile}
    ##      {__dir_run}
    ##      {__dir_data}
    ##
    ## the following replacement token is also defined for you:
    ##      {__step}            the number of runs done so far, padded to six digits with leading zeros


In [None]:
def define_experiment(exp_dict, args):
    """Describe an experiment whose run command varies with run param DS_TYPENAME.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make -j6')  ## -j6: let make run up to 6 jobs in parallel

    ## each value listed here replaces the {DS_TYPENAME} token in the run command
    data_structures = ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ']
    add_run_param(exp_dict, 'DS_TYPENAME', data_structures)

    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./{DS_TYPENAME}.debra -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 1000'
    set_cmd_run(exp_dict, run_cmd)


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

run_in_jupyter(define_experiment, cmdline_args='--production')

# Extracting data fields from captured stdout/err

note 3 data files were produced this time... one for each value of `DS_TYPENAME`. let's put those data files to use by specifying that we want to *extract* some text from each data file.

in particular, let's extract a line of the form "`DS_TYPENAME=...`" and a line of the form "`total_throughput=...`" from each data file. (you can find such lines in the data file above if you like.)

extracted data is stored in a sqlite database `data/output_database.sqlite` in a table called `data`. (each field name passed to `add_data_field` becomes a **column** in `data`.)

to do this, we call `add_data_field()`.

In [None]:
def define_experiment(exp_dict, args):
    """Describe the DS_TYPENAME experiment and name the fields to extract from its output.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make -j6')

    data_structures = ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ']
    add_run_param(exp_dict, 'DS_TYPENAME', data_structures)

    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./{DS_TYPENAME}.debra -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 1000'
    set_cmd_run(exp_dict, run_cmd)

    ## each field name registered here becomes a column of the `data` table
    add_data_field(exp_dict, 'DS_TYPENAME')
    add_data_field(exp_dict, 'total_throughput')


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

run_in_jupyter(define_experiment, cmdline_args='--production --no-compile')

# Querying the database

Note that we can simply **access** the last database we created, *WITHOUT rerunning* any experiments, by adding `--no-run --no-createdb` to `cmdline_args` in our `run_in_jupyter` call.

Also note that you can accomplish the same thing from the **command line** by running `../../tools/data_framework/run_experiment.py` with the **same** `cmdline_args`. However, since you can't pass your `define_experiment` function as a command line argument, you have to save it in a `.py` file and pass the name of that file as the first argument to `run_experiment.py`.

To query the database, we can use function `select_to_dataframe(sql_string)` with a suitable `sql_string`. There are many other powerful functions included for querying and plotting data, but those are covered in `microbench_experiments/example/instructions_data.ipynb`. In **this** notebook we are focusing on the design of the `define_experiment` function.

In [None]:
import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

## the run_in_jupyter call below has equivalent command:
## [...]/run_experiment.py myexp.py --production --no-compile --no-run --no-createdb
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --no-run --no-createdb')

## fetch every row of the `data` table and show it as this cell's output
df = select_to_dataframe('select * from data')
df


# Suppressing logging output in `run_in_jupyter`

If you want to call `run_in_jupyter` without seeing the logs copied to stdout, you can disable the log output by calling `disable_tee_stdout()`. Note that logs will still occur, but the output will **only** go to the log file `output_log.txt`.

In [None]:
import sys ; sys.path.append('../../tools/data_framework') ; from run_experiment import *

## stop copying the logs to stdout while we access the database
disable_tee_stdout()
try:
    run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --no-run --no-createdb')
    df = select_to_dataframe('select * from data')
finally:
    ## always re-enable, even if a call above raises, or you won't get output where you expect it...
    enable_tee_stdout()
df


# Running multiple trials

    ## if you want to perform repeated trials of each experimental configuration, add a run_param called "__trials"
    ##     and specify a list of trial numbers (as below).
    ##
    ## (the run_param doesn't *need* to be called __trials exactly, but if it is called __trials exactly,
    ##     then extra sanity checks will be performed to verify, for example, that each data point in a graphical plot
    ##     represents the average of precisely as many experimental runs as there are entries in the __trials list.)


In [None]:
def define_experiment(exp_dict, args):
    """Describe the DS_TYPENAME experiment with a `__trials` run param added.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make -j6')

    ## list of trial numbers
    trial_numbers = [1, 2, 3]
    data_structures = ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ']
    add_run_param(exp_dict, '__trials', trial_numbers)
    add_run_param(exp_dict, 'DS_TYPENAME', data_structures)

    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./{DS_TYPENAME}.debra -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 1000'
    set_cmd_run(exp_dict, run_cmd)

    add_data_field(exp_dict, 'DS_TYPENAME')
    add_data_field(exp_dict, 'total_throughput')


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

run_in_jupyter(define_experiment, cmdline_args='--production --no-compile')

### Querying the data (to see the multiple trials)

In [None]:
## show every row of the `data` table (the multiple trials appear as separate rows)
select_to_dataframe('select * from data')

# Extractors: mining data from arbitrary text

    ## by default, a field "XYZ" will be fetched from each data file using extractor grep_line,
    ##      which greps (searches) for a line of the form "XYZ=[arbitrary string]\n"
    ##
    ## if your field is not stored in that format, you can specify a custom "extractor" function,
    ##      as we do in our example "get_maxres" BELOW, to extract the max resident size
    ##      from the 6th space-separated column of the output of the linux "time" command
    ##
    ## also note: each of these fields becomes a replacement token, e.g., {PAPI_L3_TCM}.
    ##
    ## the following special fields are also defined for you:
    ##      {__step}            the number of runs done so far, padded to six digits with leading zeros
    ##      {__cmd_run}         your cmd_run string with any tokens replaced appropriately for this run
    ##      {__file_data}       the output filename for the current run's data
    ##      {__path_data}       the relative path to the output file for the current run's data
    ##      {__hostname}        the result of running the hostname command on the machine
    ##      {__id}              a unique row ID

    ## note: in the following, defaults are "validator=is_nonempty" and "extractor=grep_line"


### Text output we are *trying* to extract max resident size from

In [None]:
## text output we are trying to extract max resident size from in MB


### Extractor that accomplishes this

### **Using** this extractor in `define_experiment`

# Validators: *checking* extracted data

# Plotting the results (for data with <ins>3 dimensions</ins>)

introduce `add_plot_set`

need command line arg `--do-plot`

In [None]:
def define_experiment(exp_dict, args):
    """Describe a trials x threads x data-structure experiment and one bar plot of it.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_tools(exp_dict, cwd + '/../../tools')  ## tools library for plotting
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make bin_dir={__dir_run} -j6')

    add_run_param(exp_dict, '__trials', [1, 2])
    add_run_param(exp_dict, 'TOTAL_THREADS', [1, 2, 4, 8])
    add_run_param(exp_dict, 'DS_TYPENAME', ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ'])

    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./{DS_TYPENAME}.debra -nwork {TOTAL_THREADS} -nprefill {TOTAL_THREADS} -insdel 5 5 -k 200000 -t 1000'
    set_cmd_run(exp_dict, run_cmd)

    ## extracted fields, each with a validator applied to the extracted value
    add_data_field(exp_dict, 'DS_TYPENAME', validator=is_run_param('DS_TYPENAME'))
    add_data_field(exp_dict, 'TOTAL_THREADS', validator=is_run_param('TOTAL_THREADS'))
    add_data_field(exp_dict, 'total_throughput', coltype='INTEGER', validator=is_positive)

    ## a single bar plot: one series per DS_TYPENAME, threads on the x axis
    add_plot_set(
        exp_dict,
        name='throughput.png',
        title='Throughput vs data structure',
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
        plot_cmd_args='--legend-include',
    )


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

## plotting needs command line arg --do-plot
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --do-plot')

## Let's view the data and plot produced by the previous cell

(You have to run the previous cell before running the next one.)

In [None]:
from IPython.display import Image
## show the plot image written by the previous cell, then the rows of the `data` table
display(Image('data/throughput.png'))
display(select_to_dataframe('select * from data'))

# Producing *many* plots (for data with <ins>5 dimensions</ins>)

let's add a couple of dimensions:
- key range (`MAXKEY` in the data file)
- update rate (`INS_DEL_FRAC` in the data file)

and use them to produce **multiple plots** (one for each combination of values of these dimensions). we do this by specifying `varying_cols_list` in `add_plot_set`.

we can also customize the plot file`name`s and `title`s with these parameters.

# Showing these plots in a table in an HTML page

we also generate an HTML page to show off these grids in a table by invoking `add_page_set`.

HTML page construction only occurs if you specify command line argument `--do-pages` to `run_experiment.py`. so, we add this to `run_in_jupyter`.

In [None]:
def define_experiment(exp_dict, args):
    """Describe a 5-dimensional experiment, a set of plots, a legend and an HTML page.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_tools(exp_dict, cwd + '/../../tools')  ## path to tools library
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make bin_dir={__dir_run} -j6')

    add_run_param(exp_dict, '__trials', [1, 2])
    add_run_param(exp_dict, 'TOTAL_THREADS', [1, 2, 4, 8])
    add_run_param(exp_dict, 'DS_TYPENAME', ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ'])
    add_run_param(exp_dict, 'MAXKEY', [20000, 200000])
    add_run_param(exp_dict, 'INS_DEL_FRAC', ["0.0 0.0", "5.0 5.0"])

    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./{DS_TYPENAME}.debra -nwork {TOTAL_THREADS} -nprefill {TOTAL_THREADS} -insdel {INS_DEL_FRAC} -k {MAXKEY} -t 1000'
    set_cmd_run(exp_dict, run_cmd)

    add_data_field(exp_dict, 'DS_TYPENAME', validator=is_run_param('DS_TYPENAME'))
    add_data_field(exp_dict, 'TOTAL_THREADS', coltype='INTEGER', validator=is_run_param('TOTAL_THREADS'))
    add_data_field(exp_dict, 'INS_DEL_FRAC', validator=is_run_param('INS_DEL_FRAC'))
    add_data_field(exp_dict, 'MAXKEY', coltype='INTEGER', validator=is_run_param('MAXKEY'))
    add_data_field(exp_dict, 'total_throughput', coltype='INTEGER', validator=is_positive)

    ## one plot for each combination of values of the columns in varying_cols_list;
    ## the same columns are used as tokens in the plot file name and title
    add_plot_set(
        exp_dict,
        name='throughput-{INS_DEL_FRAC}-{MAXKEY}k.png',
        title='{INS_DEL_FRAC} {MAXKEY}k: throughput',
        varying_cols_list=['MAXKEY', 'INS_DEL_FRAC'],
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
    )

    ## render one legend for all plots (since the legend is the same for all).
    ## if legend varies from plot to plot, you might enable legends for all plots,
    ## or write a custom plotting command that determines what to do, given your data
    add_plot_set(
        exp_dict,
        name='throughput-legend.png',
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
        plot_cmd_args='--legend-only --legend-columns 3',
    )

    ## we place the above legend at the bottom of *each* table by providing "legend_file"
    add_page_set(
        exp_dict,
        image_files='throughput-{INS_DEL_FRAC}-{MAXKEY}k.png',
        name='throughput',
        column_field='INS_DEL_FRAC',
        row_field='MAXKEY',
        legend_file='throughput-legend.png',
    )


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

## HTML page construction only occurs with --do-pages
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --do-plot --do-pages')

## Let's view the plots produced by the previous cell

note you can click on the plots to "drill down" into the data.

In [None]:
## display the HTML page data/throughput.html
show_html('data/throughput.html')

# How about 4 dimensions?

Let's remove the `MAXKEY` column / data dimension to reduce the dimensionality of the data to 4.

With only one column in the `varying_cols_list` and NO `row_field` specified in `add_page_set`, there will only be one row of plots. (So a strip of plots instead of a grid.)

In [None]:
def define_experiment(exp_dict, args):
    """Describe a 4-dimensional experiment (no MAXKEY param) with one strip of plots.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_tools(exp_dict, cwd + '/../../tools')  ## path to tools library
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make bin_dir={__dir_run} -j6')

    add_run_param(exp_dict, '__trials', [1, 2])
    add_run_param(exp_dict, 'TOTAL_THREADS', [1, 2, 4, 8])
    add_run_param(exp_dict, 'DS_TYPENAME', ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ'])
    add_run_param(exp_dict, 'INS_DEL_FRAC', ["0.0 0.0", "5.0 5.0"])

    ## the key range is fixed (-k 200000) rather than being a run param
    run_cmd = 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./{DS_TYPENAME}.debra -nwork {TOTAL_THREADS} -nprefill {TOTAL_THREADS} -insdel {INS_DEL_FRAC} -k 200000 -t 1000'
    set_cmd_run(exp_dict, run_cmd)

    add_data_field(exp_dict, 'DS_TYPENAME', validator=is_run_param('DS_TYPENAME'))
    add_data_field(exp_dict, 'TOTAL_THREADS', coltype='INTEGER', validator=is_run_param('TOTAL_THREADS'))
    add_data_field(exp_dict, 'INS_DEL_FRAC', validator=is_run_param('INS_DEL_FRAC'))
    add_data_field(exp_dict, 'total_throughput', coltype='INTEGER', validator=is_positive)

    ## one plot per value of INS_DEL_FRAC
    add_plot_set(
        exp_dict,
        name='throughput-{INS_DEL_FRAC}.png',
        title='{INS_DEL_FRAC}: throughput',
        varying_cols_list=['INS_DEL_FRAC'],
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
    )

    ## render one legend for all plots (since the legend is the same for all).
    ## if legend varies from plot to plot, you might enable legends for all plots,
    ## or write a custom plotting command that determines what to do, given your data
    add_plot_set(
        exp_dict,
        name='throughput-legend.png',
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
        plot_cmd_args='--legend-only --legend-columns 3',
    )

    ## no row_field here, only a column_field.
    ## we place the above legend at the bottom of *each* table by providing "legend_file"
    add_page_set(
        exp_dict,
        image_files='throughput-{INS_DEL_FRAC}.png',
        name='throughput',
        column_field='INS_DEL_FRAC',
        legend_file='throughput-legend.png',
    )


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --do-plot --do-pages')

## Let's view the plots produced by the previous cell

In [None]:
## display the HTML page data/throughput.html
show_html('data/throughput.html')

# Plots and HTML for data with <ins>6 dimensions</ins>

note that we could have added more than 2 dimensions of data (resulting in data with 6+ dimensions), listing potentially many fields in `varying_cols_list`, and this simply would have resulted in *more plots*.

note that if we had **one** more dimension of data (6 dimensions in total), it could be listed in the keyword argument `table_field`, and **multiple** HTML tables would be rendered in a single HTML page (one for each value of this column).

In [None]:
def define_experiment(exp_dict, args):
    """Describe a 6-dimensional experiment; the extra `malloc` param selects the table.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_tools(exp_dict, cwd + '/../../tools')  ## path to tools library
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make bin_dir={__dir_run} -j6')

    add_run_param(exp_dict, '__trials', [1])
    add_run_param(exp_dict, 'TOTAL_THREADS', [2, 4, 8])
    add_run_param(exp_dict, 'DS_TYPENAME', ['brown_ext_abtree_lf', 'bronson_pext_bst_occ'])
    add_run_param(exp_dict, 'MAXKEY', [20000, 200000])
    add_run_param(exp_dict, 'INS_DEL_FRAC', ['0.0 0.0', '5.0 5.0'])
    ## unlike the above four fields,
    ## the run command does NOT produce a line of the form 'malloc=[...]'.
    ## so, run_experiment.py will APPEND a line of this form to the datafile!
    add_run_param(exp_dict, 'malloc', ['jemalloc', 'mimalloc'])

    ## {malloc} picks which allocator library is preloaded
    run_cmd = 'LD_PRELOAD=../../lib/lib{malloc}.so numactl --interleave=all time ./{DS_TYPENAME}.debra -nwork {TOTAL_THREADS} -nprefill {TOTAL_THREADS} -insdel {INS_DEL_FRAC} -k {MAXKEY} -t 1000'
    set_cmd_run(exp_dict, run_cmd)

    add_data_field(exp_dict, 'DS_TYPENAME', validator=is_run_param('DS_TYPENAME'))
    add_data_field(exp_dict, 'TOTAL_THREADS', coltype='INTEGER', validator=is_run_param('TOTAL_THREADS'))
    add_data_field(exp_dict, 'INS_DEL_FRAC', validator=is_run_param('INS_DEL_FRAC'))
    add_data_field(exp_dict, 'MAXKEY', coltype='INTEGER', validator=is_run_param('MAXKEY'))
    add_data_field(exp_dict, 'total_throughput', coltype='INTEGER', validator=is_positive)
    add_data_field(exp_dict, 'malloc', validator=is_run_param('malloc'))

    ## one plot per combination of malloc, MAXKEY and INS_DEL_FRAC values
    add_plot_set(
        exp_dict,
        name='throughput-{malloc}-{INS_DEL_FRAC}-{MAXKEY}.png',
        title='{malloc} {INS_DEL_FRAC} {MAXKEY}',
        varying_cols_list=['malloc', 'MAXKEY', 'INS_DEL_FRAC'],
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
    )

    ## render one legend for all plots (since the legend is the same for all).
    ## if legend varies from plot to plot, you might enable legends for all plots,
    ## or write a custom plotting command that determines what to do, given your data
    add_plot_set(
        exp_dict,
        name='throughput-legend.png',
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
        plot_cmd_args='--legend-only --legend-columns 3',
    )

    ## note: choice of column / row / table field determines how the HTML page looks -- up to you!
    add_page_set(
        exp_dict,
        image_files='throughput-{malloc}-{INS_DEL_FRAC}-{MAXKEY}.png',
        name='throughput',
        column_field='INS_DEL_FRAC',
        row_field='MAXKEY',
        table_field='malloc',
        legend_file='throughput-legend.png',
    )


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --do-plot --do-pages')

## Let's view the data, plots and HTML we produced

In [None]:
## display the HTML page data/throughput.html, then the rows of the `data` table
show_html('data/throughput.html')
display(select_to_dataframe('select * from data'))

# Plots and HTML for data with <ins>7+ dimensions</ins>

if we had MORE than one extra dimension of data (7+ dimensions in total), we could list additional fields in the keyword argument `page_field_list`, which would cause additional HTML pages to be rendered (one for each combination of values for fields in `page_field_list`), and linked together by an `index.html`. (note that the `name` keyword argument of `add_page_set` must also be modified to reference these fields, in order for multiple HTML files to be created---you must specify what sort of naming convention you'd like the framework to use.)

In [None]:
def define_experiment(exp_dict, args):
    """Describe a 7-dimensional experiment with `malloc` and `numactl` run params.

    exp_dict is handed to every helper below; args is not used here.
    """
    cwd = os.getcwd()
    set_dir_tools(exp_dict, cwd + '/../../tools')  ## path to tools library
    set_dir_compile(exp_dict, cwd + '/../../microbench')
    set_dir_run(exp_dict, cwd + '/../../microbench/bin')
    set_cmd_compile(exp_dict, 'make bin_dir={__dir_run} -j6')

    add_run_param(exp_dict, '__trials', [1])
    add_run_param(exp_dict, 'TOTAL_THREADS', [2, 8])
    add_run_param(exp_dict, 'DS_TYPENAME', ['brown_ext_abtree_lf', 'bronson_pext_bst_occ'])
    add_run_param(exp_dict, 'MAXKEY', [20000, 200000])
    add_run_param(exp_dict, 'INS_DEL_FRAC', ['0.0 0.0', '5.0 5.0'])
    ## unlike the above four fields,
    ## the run command does NOT produce a line of the form 'malloc=[...]'.
    ## so, run_experiment.py will APPEND a line of this form to the datafile!
    add_run_param(exp_dict, 'malloc', ['jemalloc', 'mimalloc'])
    ## ditto for numactl (the empty string runs the command without a numactl prefix)
    add_run_param(exp_dict, 'numactl', ['', 'numactl --interleave=all'])

    run_cmd = 'LD_PRELOAD=../../lib/lib{malloc}.so {numactl} time ./{DS_TYPENAME}.debra -nwork {TOTAL_THREADS} -nprefill {TOTAL_THREADS} -insdel {INS_DEL_FRAC} -k {MAXKEY} -t 1000'
    set_cmd_run(exp_dict, run_cmd)

    add_data_field(exp_dict, 'DS_TYPENAME', validator=is_run_param('DS_TYPENAME'))
    add_data_field(exp_dict, 'TOTAL_THREADS', coltype='INTEGER', validator=is_run_param('TOTAL_THREADS'))
    add_data_field(exp_dict, 'INS_DEL_FRAC', validator=is_run_param('INS_DEL_FRAC'))
    add_data_field(exp_dict, 'MAXKEY', coltype='INTEGER', validator=is_run_param('MAXKEY'))
    add_data_field(exp_dict, 'total_throughput', coltype='INTEGER', validator=is_positive)
    add_data_field(exp_dict, 'malloc', validator=is_run_param('malloc'))
    add_data_field(exp_dict, 'numactl', validator=is_run_param('numactl'))

    ## one plot per combination of malloc, numactl, MAXKEY and INS_DEL_FRAC values
    add_plot_set(
        exp_dict,
        name='throughput-{malloc}-{numactl}-{INS_DEL_FRAC}-{MAXKEY}.png',
        title='{INS_DEL_FRAC} {MAXKEY}',
        varying_cols_list=['malloc', 'numactl', 'MAXKEY', 'INS_DEL_FRAC'],
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
    )

    ## render one legend for all plots (since the legend is the same for all).
    ## if legend varies from plot to plot, you might enable legends for all plots,
    ## or write a custom plotting command that determines what to do, given your data
    add_plot_set(
        exp_dict,
        name='throughput-legend.png',
        series='DS_TYPENAME',
        x_axis='TOTAL_THREADS',
        y_axis='total_throughput',
        plot_type='bars',
        plot_cmd_args='--legend-only --legend-columns 3',
    )

    ## we place the above legend at the bottom of *each* table by providing "legend_file".
    ## NOTE(review): 'throughput' does not reference INS_DEL_FRAC, the only field in
    ## page_field_list -- confirm whether `name` should include a {INS_DEL_FRAC} token
    ## for one HTML file per page to be produced.
    add_page_set(
        exp_dict,
        image_files='throughput-{malloc}-{numactl}-{INS_DEL_FRAC}-{MAXKEY}.png',
        name='throughput',
        column_field='numactl',
        row_field='malloc',
        table_field='MAXKEY',
        page_field_list=['INS_DEL_FRAC'],
        legend_file='throughput-legend.png',
    )


import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

## NOTE(review): --no-run is passed here although this cell adds a 'numactl' run param and
## uses a different TOTAL_THREADS list than the previous cell -- confirm this is intended.
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --no-run --do-plot --do-pages')

## Let's view the data, plots and HTML we produced

In [None]:
## display the HTML page data/index.html, then the rows of the `data` table
show_html('data/index.html')
display(select_to_dataframe('select * from data'))

# It's easy to plot *many* value fields vs your `run_params`

# notebook todo
- validators
- extractors
- testing mode
- special reserved data field names
- custom output filename pattern, output path
- best-effort automated sanity checks
- complete control with custom plot function
- archival benefits (zip: data, commit id, diffs from commit)

may want to support pages with columns covering a list of specific data values (throughput, l3miss, l2miss, cycles, maxresident, etc.)