# Data framework: the basic paradigm

The user implements one function, `define_experiment`,

then runs `run_experiments.py`.

It runs potentially many experimental trials (over all defined configurations), captures their output, builds a sqlite database, queries it, produces plots, and produces HTML pages to display those plots...

There are also lots of tools for querying, plot generation and analysis in Jupyter notebooks.

# Run the following code cell before any others

It does basic initialization for this notebook.

In [None]:
## make the data framework importable from this notebook's directory
import sys
sys.path.append('../../tools/data_framework')

## star import on purpose: the cells below call the framework's helpers by bare name
from run_experiment import *

print("Initialized.")

# The 'hello world' of `run_experiments.sh`

Here we define a trivial experiment that compiles and runs a single command once and saves the output.

We call `run_in_jupyter` and pass it `define_experiment`. Alternatively, we could save `define_experiment` in a Python file and run the equivalent `run_experiments.sh` command...

In [None]:
def define_experiment(exp_dict, args):
    """Describe the 'hello world' experiment: one compile command and one run command.

    exp_dict -- experiment description, filled in by the set_* calls below
    args     -- part of the expected signature; not used by this definition
    """
    microbench_dir = os.getcwd() + '/../../microbench'
    run_command = (
        'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time '
        './ubench_brown_ext_abtree_lf.alloc_new.reclaim_debra.pool_none.out '
        '-nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 3000'
    )

    set_dir_compile(exp_dict, microbench_dir)            ## working dir for compiling
    set_dir_run(exp_dict, microbench_dir + '/bin')       ## working dir for running
    set_cmd_compile(exp_dict, 'make -j all')
    set_cmd_run(exp_dict, run_command)


run_in_jupyter(define_experiment, cmdline_args='--production')
## if define_experiment() above were saved in myexp.py, the call above
## would be equivalent to running this shell command:
##   ../../tools/data_framework/run_experiments.sh myexp.py --production

# (Re)running results without compiling

We will remedy the warning message soon...

First, here is how to rerun an experiment without recompiling: add `--no-compile` to `cmdline_args`.

In [None]:
def define_experiment(exp_dict, args):
    """Same single-command experiment as before; only the run_in_jupyter flags below differ.

    exp_dict -- experiment description, filled in by the set_* calls below
    args     -- part of the expected signature; not used by this definition
    """
    base_dir = os.getcwd() + '/../../microbench'
    benchmark = './ubench_brown_ext_abtree_lf.alloc_new.reclaim_debra.pool_none.out'
    run_cmd = (
        'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time '
        + benchmark
        + ' -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 3000'
    )

    set_dir_compile(exp_dict, base_dir)              ## working dir for compiling
    set_dir_run(exp_dict, base_dir + '/bin')         ## working dir for running
    set_cmd_compile(exp_dict, 'make -j all')
    set_cmd_run(exp_dict, run_cmd)


## --no-compile: skip the compile command and go straight to running
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile')
## shell equivalent: [...]/run_experiments.sh myexp.py --production --no-compile


# Data files (captured stdout/err)

introduce concept of data files (capturing the output of the run)

this is the output of that one run command we did

In [None]:
## print the contents of data/data000001.txt, i.e., the output captured from the run
print(shell_to_str('cat data/data000001.txt'))

# Running with varying parameters

of course running one command isn't very interesting...

introduce run params

    ## add parameters that you want your experiments to be run with.
    ## your program will be run once for each set of values in the CROSS PRODUCT of all parameters.
    ## (i.e., we will run your program with every combination of parameters)

introduce replacement strings: {DS_TYPENAME}

    ## you can use any of the run params you define to dynamically replace {_tokens_like_this} in strings. for example, we can include {DS_TYPENAME} in our run command, and it will be replaced by the current value of {DS_TYPENAME} (that's right, we can run different commands based on the current value of DS_TYPENAME)
    ## you can also get the paths to key directories by using:
    ##      {__dir_compile}
    ##      {__dir_run}
    ##      {__dir_data}
    ##
    ## the following replacement token is also defined for you:
    ##      {__step}            the number of runs done so far, padded to six digits with leading zeros


In [None]:
def define_experiment(exp_dict, args):
    """Describe an experiment that runs the benchmark once per DS_TYPENAME value.

    exp_dict -- experiment description, filled in by the set_*/add_* calls below
    args     -- part of the expected signature; not used by this definition
    """
    set_dir_compile  (exp_dict, os.getcwd() + '/../../microbench')
    set_dir_run      (exp_dict, os.getcwd() + '/../../microbench/bin')
    set_cmd_compile  (exp_dict, 'make -j all')

    ## one run per value in this list; {DS_TYPENAME} in the run command is replaced by the current value
    add_run_param    (exp_dict, 'DS_TYPENAME', ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ'])

    ## the LD_PRELOAD path is relative to the run dir (microbench/bin),
    ## so ../../lib resolves to <cwd>/../../lib (a third ../ would overshoot by one level)
    set_cmd_run      (exp_dict, 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./ubench_{DS_TYPENAME}.alloc_new.reclaim_debra.pool_none.out -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 3000')

run_in_jupyter(define_experiment, cmdline_args='--production --no-compile')

# Extracting data fields from captured stdout/err

note 3 data files were produced this time... one for each value of `DS_TYPENAME`. let's put those data files to use and get rid of that warning, by specifying that we want to *extract* some text from each data file.

in particular, let's extract a line of the form "`DS_TYPENAME=...`" and a line of the form "`total_throughput=...`" from each data file. (you can find such lines in the data file above if you like.)

extracted data is stored in a sqlite database `data/output_database.sqlite` in a table called `data`. (each field name passed to `add_data_field` becomes a **column** in `data`.)

to do this, we call `add_data_field()`.

In [None]:
def define_experiment(exp_dict, args):
    """Describe a per-DS_TYPENAME experiment and declare two fields to extract from each data file.

    exp_dict -- experiment description, filled in by the set_*/add_* calls below
    args     -- part of the expected signature; not used by this definition
    """
    set_dir_compile  (exp_dict, os.getcwd() + '/../../microbench')
    set_dir_run      (exp_dict, os.getcwd() + '/../../microbench/bin')
    set_cmd_compile  (exp_dict, 'make -j all')

    add_run_param    (exp_dict, 'DS_TYPENAME', ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ'])

    ## the LD_PRELOAD path is relative to the run dir (microbench/bin),
    ## so ../../lib resolves to <cwd>/../../lib (a third ../ would overshoot by one level)
    set_cmd_run      (exp_dict, 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./ubench_{DS_TYPENAME}.alloc_new.reclaim_debra.pool_none.out -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 3000')

    ## fields to pull out of each data file (lines of the form "DS_TYPENAME=..." and "total_throughput=...")
    add_data_field   (exp_dict, 'DS_TYPENAME')
    add_data_field   (exp_dict, 'total_throughput')

run_in_jupyter(define_experiment, cmdline_args='--production --no-compile')

# Querying the database

Note that we can simply **access** the last database we created, *WITHOUT rerunning* any experiments, by adding `--no-run --no-createdb` to `cmdline_args` in our `run_in_jupyter` call.

Also note that you can accomplish the same thing from the **command line** by running `../../tools/data_framework/run_experiments.py` with the **same** `cmdline_args`. However, since you can't pass your `define_experiment` function as a command line argument, you have to save it in a `.py` file and pass the name of that file as the first argument to `run_experiments.py`.

To query the database, we can use function `select_to_dataframe(sql_string)` with a suitable `sql_string`. There are many other powerful functions included for querying and plotting data, but those are covered in `microbench_experiments/example/instructions_data.ipynb`. In **this** notebook we are focusing on the design of the `define_experiment` function.

In [None]:
## --no-run --no-createdb: access the last database created, without rerunning any experiments
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --no-run --no-createdb')
## query every column of every row in the `data` table
select_to_dataframe('select * from data')

## the run_in_jupyter call above has the equivalent shell command:
##   [...]/run_experiments.sh myexp.py --production --no-compile --no-run --no-createdb


# Suppressing logging output in `run_in_jupyter`

If you want to call `run_in_jupyter` without seeing the logs copied to stdout, you can disable the log output by calling `disable_tee_stdout()`. Note that logs will still occur, but the output will **only** go to the log file `output_log.txt`.

In [None]:
## stop copying log output to stdout; logging still goes to the log file output_log.txt
disable_tee_stdout()
## same database-only invocation as before: nothing is compiled, run, or rebuilt
run_in_jupyter(define_experiment, cmdline_args='--production --no-compile --no-run --no-createdb')
## query every column of every row in the `data` table
select_to_dataframe('select * from data')

# Extractors: mining data from arbitrary text

    ## by default, a field "XYZ" will be fetched from each data file using extractor grep_line,
    ##      which greps (searches) for a line of the form "XYZ=[arbitrary string]\n"
    ##
    ## if your field is not stored in that format, you can specify a custom "extractor" function,
    ##      as we do in our example "get_maxres" BELOW, to extract the max resident size
    ##      from the 6th space-separated column of the output of the linux "time" command
    ##
    ## also note: each of these fields becomes a replacement token, e.g., {PAPI_L3_TCM}.
    ##
    ## the following special fields are also defined for you:
    ##      {__step}            the number of runs done so far, padded to six digits with leading zeros
    ##      {__cmd_run}         your cmd_run string with any tokens replaced appropriately for this run
    ##      {__file_data}       the output filename for the current run's data
    ##      {__path_data}       the relative path to the output file for the current run's data
    ##      {__hostname}        the result of running the hostname command on the machine
    ##      {__id}              a unique row ID

    ## note: in the following, defaults are "validator=is_nonempty" and "extractor=grep_line"


### Text output we are *trying* to extract max resident size from

In [None]:
## text output we are trying to extract max resident size from in MB


### Extractor that accomplishes this

### **Using** this extractor in `define_experiment`

# Validators: *checking* extracted data

# Running multiple trials

    ## if you want to perform repeated trials of each experimental configuration, add a run_param called "__trials"
    ##     and specify a list of trial numbers (as below).
    ##
    ## (the run_param doesn't *need* to be called __trials exactly, but if it is called __trials exactly,
    ##     then extra sanity checks will be performed to verify, for example, that each data point in a graphical plot
    ##     represents the average of precisely as many experimental runs as there are entries in the __trials list.)


In [None]:
## repeated here so that this cell can be run without the initialization cell
import sys
sys.path.append('../../tools/data_framework')
from run_experiment import *

def define_experiment(exp_dict, args):
    """Describe an experiment with three trials for each DS_TYPENAME value.

    exp_dict -- experiment description, filled in by the set_*/add_* calls below
    args     -- part of the expected signature; not used by this definition
    """
    set_dir_compile  (exp_dict, os.getcwd() + '/../../microbench')
    set_dir_run      (exp_dict, os.getcwd() + '/../../microbench/bin')
    set_cmd_compile  (exp_dict, 'make -j all')

    ## run params are combined as a cross product: 3 trials x 3 data structures
    add_run_param    (exp_dict, '__trials', [1, 2, 3])
    add_run_param    (exp_dict, 'DS_TYPENAME', ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ'])

    ## the LD_PRELOAD path is relative to the run dir (microbench/bin),
    ## so ../../lib resolves to <cwd>/../../lib (a third ../ would overshoot by one level)
    set_cmd_run      (exp_dict, 'LD_PRELOAD=../../lib/libjemalloc.so numactl --interleave=all time ./ubench_{DS_TYPENAME}.alloc_new.reclaim_debra.pool_none.out -nwork 1 -nprefill 1 -insdel 5 5 -k 200000 -t 1000')

    add_data_field   (exp_dict, 'DS_TYPENAME')
    add_data_field   (exp_dict, 'total_throughput')

run_in_jupyter(define_experiment, cmdline_args='--production --no-compile')

### Querying the data (to see the multiple trials)

In [None]:
## query every column of every row in the `data` table (presumably one row per run -- confirm against the output)
select_to_dataframe('select * from data')

# todo

- validators
- extractors
- testing mode
- plots
- pages
- special reserved data field names
- custom output filename pattern, output path
- best-effort automated sanity checks
- complete control with custom plot function

    ## pattern for output filenames. note 1: these files will be placed in {__dir_data}/. note 2: filenames cannot contain spaces.
    set_file_data   ( exp_dict, 'data{__step}.txt' )

In [None]:
def define_experiment(exp_dict, args):
    """Describe a fuller experiment: explicit directories, validated data fields, and a bar plot.

    exp_dict -- experiment description, filled in by the set_*/add_* calls below
    args     -- part of the expected signature; not used by this definition

    Returns exp_dict.
    """
    set_dir_tools   (exp_dict, os.getcwd() + '/../../tools')      ## path to prereq. tools library
    set_dir_compile (exp_dict, os.getcwd() + '/../../microbench') ## working directory for compile
    set_dir_run     (exp_dict, os.getcwd() + '/bin')              ## working directory for run
    set_dir_data    (exp_dict, os.getcwd() + '/data')             ## path for output files

    add_run_param   (exp_dict, '__trials', [1, 2, 3])
    add_run_param   (exp_dict, 'DS_TYPENAME', ['brown_ext_ist_lf', 'brown_ext_abtree_lf', 'bronson_pext_bst_occ'])

    ## {__dir_run} is replaced by the run directory set above
    set_cmd_compile (exp_dict, 'make -j all bin_dir={__dir_run}')
    ## the LD_PRELOAD path is relative to the run dir (<cwd>/bin), so it resolves to <cwd>/../../lib
    set_cmd_run     (exp_dict, 'LD_PRELOAD=../../../lib/libjemalloc.so numactl --interleave=all time ./ubench_{DS_TYPENAME}.alloc_new.reclaim_debra.pool_none.out -nwork 190 -nprefill 190 -insdel 5 5 -k 2000000 -t 3000 -rq 0 -rqsize 1 -nrq 0')

    ## {__step} makes the filename differ for every run; DS_TYPENAME alone would give
    ## the three __trials of one data structure the same output filename
    set_file_data   (exp_dict, 'data{__step}_{DS_TYPENAME}.txt')

    add_data_field  (exp_dict, 'DS_TYPENAME'      , coltype='TEXT'    , validator=is_run_param('DS_TYPENAME'))
    add_data_field  (exp_dict, 'total_throughput' , coltype='INTEGER' , validator=is_positive)

    ## no backslashes needed: lines inside the parentheses are joined implicitly
    add_plot_set(
            exp_dict
          , name='throughput.png'
          , title='Throughput vs data structure'
          , x_axis='DS_TYPENAME'
          , y_axis='total_throughput'
          , plot_type='bars'
    )
    return exp_dict
