This notebook contains the results of running our evaluation.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import math
import scipy.stats as stats
from pathlib import Path

def pretty(ax):
    """Apply the notebook's shared look to a matplotlib axes.

    Hides the right and top spines, moves all four spines outward by 5 and
    colours them gray, then colours the labels and tick lines of every major
    tick on both axes gray.
    """
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

    for side in ("left", "bottom", "right", "top"):
        border = ax.spines[side]
        border.set_position(("outward", 5))
        border.set_color("gray")

    for axis in (ax.yaxis, ax.xaxis):
        for tick in axis.get_major_ticks():
            # Each tick carries two labels and two tick lines (one per side).
            for artist in (tick.label1, tick.label2, tick.tick1line, tick.tick2line):
                artist.set_color("gray")


# Full

This section covers the evaluation where we preserve the full bug. We start by loading the data and indexing it by `name`, `predicate`, and `strategy`. The data have been computed and put in `result/full/result.csv` by our evaluation framework.

In [None]:
# Load the evaluation results and index the rows by (name, predicate, strategy).
results = pd.read_csv("result/full/result.csv").set_index(["name", "predicate","strategy"])


A single line of our data looks like this; we store the following data: 

*  `bugs` which contains the number of lines in the cleaned-up bug report,

*  `initial-scc` and `scc` contain the number of strongly connected components before and after reduction,

*  `initial-classes` and `classes` contain the number of classes before and after reduction,

*  `initial-bytes` and `bytes` contain the number of bytes before and after reduction,

*  `iters` which contains the number of invocations of the predicate, 

*  `searches` which contains the number of binary searches made by the algorithm,

*  `time` which records the time to reach the final successful solution,

*  `status` which records whether the reduction completed correctly,

*  `verify` which records information about whether the bug is preserved.

Here is an example:


In [None]:
# Example: select by the first two index levels (name, predicate), leaving the rows for each strategy.
results.loc["url0067cdd33d_goldolphin_Mi", "cfr"]

In [None]:
# Example: select a single run by its full (name, predicate, strategy) key.
results.loc["url0067cdd33d_goldolphin_Mi", "cfr", "items+logic"]

In [None]:
# CNF sizes per benchmark, keyed by benchmark name.
cnfs = pd.read_csv("result/full/sizes/cnfs.csv").set_index("name")

# One row per (name, predicate) pair: the index that is left once the
# strategy level has been unstacked into columns.
index = results.unstack("strategy").index

# Look the clause and edge counts up by benchmark name for every pair.
bybench = pd.DataFrame(
    {
        column: [cnfs[column][name] for (name, _predicate) in index]
        for column in ("clauses", "edges")
    },
    index=index,
)
# Edges per clause.
bybench["graphscore"] = bybench["edges"] / bybench["clauses"]
bybench

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(10,5), sharey=True)

# Per-benchmark inputs. Each one is read from a single strategy's column after
# unstacking; presumably these values do not depend on the strategy -- TODO confirm.
bugs = results["bugs"].unstack("strategy")["classes"]
initial_bytes = results["initial-bytes"].unstack("strategy")["classes"]
initial_classes = results["initial-classes"].unstack("strategy")["classes"]
initial_variables = results["initial-scc"].unstack("strategy")["items+logic"]
# `cnfs` is indexed by benchmark name only, so the clause counts are looked up
# through the "name" level of the (name, predicate, strategy) index; indexing
# the Series with the full MultiIndex finds no matching labels.
clauses = pd.Series(
    cnfs.clauses.reindex(results.index.get_level_values("name")).to_numpy(),
    index=results.index,
)

number_of_benchmarks = len(bugs.index)

# One entry per histogram panel. The loop below reads:
#   "data"    -- the values to bin,
#   "xlabel"  -- the x-axis label,
#   "xformat" -- optional x tick formatter (defaults to a plain integer),
#   "format"  -- optional formatter for the geometric-mean label (defaults to "xformat"),
#   "splits"  -- optional bin edges (defaults to 11 evenly spaced, rounded edges).
# "title" is not read by the loop.
diagrams = [
    { "title": "Histogram of Classes"
    , "data": initial_classes
    , "xlabel": "Classes"
    },
    { "title": "Histogram of Bytes (in MB)"
    , "data": initial_bytes
    , "xformat" : lambda x, pos: f'{x / 1000000 :0.2f}'
    , "format" : lambda x, pos: f'{x / 1000 :0.0f} KB'
    , "xlabel": "Bytes (in MB)"
    },

    { "title": "Histogram of Bugs"
    , "data": bugs
    , "xlabel": "Errors in Output"
    , "format" : lambda x, pos: f'{x :0.1f}'
    , "splits": [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    },
    
    { "title": "Histogram of Variables"
    , "data": initial_variables
    , "xformat" : lambda x, pos: f'{x / 1000 :0.0f}k'
    , "format" : lambda x, pos: f'{x / 1000 :0.1f}k'
    , "xlabel": "Reducible Items"
    },
    { "title": "Histogram of Clauses"
    , "data": bybench.clauses
    , "xformat" : lambda x, pos: f'{x / 1000 :0.0f}k'
    , "format" : lambda x, pos: f'{x / 1000 :0.1f}k'
    , "xlabel": "Clauses"
    },
    { "title": "Histogram of Procentage"
    , "data": bybench.graphscore
    , "xformat" : lambda x, pos: f'{x * 100 :0.0f}%'
    , "format" : lambda x, pos: f'{x * 100 :0.0f}%'
    , "xlabel": "Edges per Clause"
    , "splits": np.linspace(bybench.graphscore.min(), 1, 11)
    },
    
]

# Only the left-most panel of each row carries the shared y-axis label.
axes[0][0].set_ylabel("Benchmarks")
axes[1][0].set_ylabel("Benchmarks")
for ax, diagram in zip(axes.flatten(), diagrams):
    pretty(ax)
    
    data = diagram["data"]
    xlim = (data.min(), data.max())
    splits = diagram.get("splits",np.linspace(*xlim, 11).round(0))
    
    # Major ticks on every second bin edge, minor ticks on all of them.
    ax.set_xlim(*xlim)
    ax.set_xticks(splits[::2])
    ax.set_xticks(splits, minor=True)
    
    ylim = (0, number_of_benchmarks)
    ax.set_ylim(*ylim)
    ax.set_yticks(np.linspace(*ylim, 8).round(0))
    if ax not in (axes[0][0], axes[1][0]):
        # The y-axis is shared, so it is only drawn on the first column.
        ax.spines["left"].set_visible(False)
        for x in ax.yaxis.get_major_ticks():
            x.set_visible(False)
    ax.set_xlabel(diagram["xlabel"])
   
    blocks = ax.hist(diagram["data"], splits, color="black", rwidth=0.75)
    
    xformat = diagram.get("xformat", lambda x, pos: f'{x:0.0f}')
    ax.xaxis.set_major_formatter(plt.FuncFormatter(xformat))
    
    # Mark the geometric mean with a dotted vertical line and a "GM ..." label.
    gmean = stats.gmean(diagram["data"])
    v = ax.vlines(gmean, *ylim)
    v.set_color("gray")
    v.set_linestyle(":")
    
    t = ax.text(gmean + (xlim[1] - xlim[0]) * 0.05, ylim[1] * 0.94, "GM " + diagram.get("format", xformat)(gmean, 0))
    t.set_color("gray")
    
fig.tight_layout()
fig.subplots_adjust(wspace=0.18)
fig.savefig("benchmarks.eps")

We are testing two strategies: 

- classes
- logic


In [None]:
# Strategies under comparison, in the order used by the plots below.
strategies = ["classes", "items+logic"][::-1]

# Hex colour palette.
p = [
    "#0a1058",
    "#ee4242",
    "#ff9135",
    "#9857ff",
    "#4cb2ff",
]

# Presentation settings, each keyed by strategy name.
colors = {
    "classes": "#5F99E7",
    "items+logic": "#1956A7",
}
shade = {
    "classes": "#F6F1B0",
    "items+logic": "#B0B6F6",
}
labels = {
    "classes": "J-Reduce",
    "items+logic": "Our Reducer",
}
styles = {
    "classes": "--",
    "items+logic": "-",
}


## Sanity Checks

*  What percentage of the benchmarks does each strategy time out on?

In [None]:
# One thin horizontal bar per strategy on a 0-100 scale.
fig, ax = plt.subplots(1, figsize=(10, 0.7))
ax.set_xlim(0, 100)
pretty(ax)

# Fraction of runs per strategy whose status is "timeout".
timeouts = (results["status"] == "timeout").groupby("strategy").mean()

# The bar length is the timeout percentage; the text next to the strategy
# name is the complementary percentage (100 minus the timeout percentage).
x = ax.barh(
    [
        " ".join([labels[s], str((100 - timeouts[s] * 100).round(1)) + "%"])
        for s in strategies
    ],
    [100 * timeouts[s] for s in strategies],
    color=[colors[s] for s in strategies],
)

* On how many benchmarks does 'classes' produce fewer classes than 'logic', and how many of
  those are not due to timeouts?

In [None]:
# Collect the benchmarks on which "classes" ends with fewer classes than
# "items+logic". Each entry is a tuple of
#   (name/predicate, items+logic classes divided by classes classes, status of the items+logic run).
# The loop variables are deliberately not called `p` or `x`, so the palette `p`
# and the bar container `x` defined in earlier cells are not overwritten.
final_classes = results.classes
outperforms = []
for (bench, predicate, strategy) in results.index:
    # Visit every (name, predicate) pair once, through its "classes" row.
    if strategy != "classes":
        continue
    by_classes = final_classes.loc[(bench, predicate, "classes")]
    by_logic = final_classes.loc[(bench, predicate, "items+logic")]
    if by_classes < by_logic:
        outperforms.append(
            ( bench + "/" + predicate
            , (by_logic / by_classes).round(1)
            , results.loc[(bench, predicate, "items+logic")].status
            )
        )
# (number of such benchmarks, number of those where items+logic did not time out)
len(outperforms), len([entry for entry in outperforms if entry[2] != "timeout"])

In [None]:
# Inspect one benchmark/predicate pair; selects by the first two index levels (name, predicate).
results.loc["url9200ed8692_olabini_ioke", "fernflower"]

* Are 'classes' and 'jreduce' comparable in classes, bytes, and time?


 ## Comparative reduction

In our comparative reduction results we will update the times of all timeout items.

In [None]:
# 24 hours expressed in seconds (assuming `time` is recorded in seconds -- TODO confirm).
TIMEOUT = 60 * 60 * 24

# Work on a copy so `results` keeps the recorded times, then overwrite the
# time of every run whose status is "timeout" with TIMEOUT.
full = results.copy()
timed_out = full["status"] == "timeout"
full.loc[timed_out, "time"] = TIMEOUT

In our first experiment we are going to look at comparative final size, and time. We use the geometric mean, so that we can compare the results:

In [None]:
# Geometric mean of time, final bytes and final classes for each strategy.
r = full[["time", "bytes", "classes"]].groupby("strategy").agg(stats.gmean)
r.round(1)

In [None]:
# Relative difference, in percent, between the "classes" and "jreduce" geometric means.
# NOTE(review): "jreduce" is not one of the strategies configured in this notebook
# ("classes", "items+logic") -- confirm it exists in result.csv, otherwise this raises KeyError.
(r.loc["classes"] / r.loc["jreduce"] * 100 - 100).round(1)

In [None]:
# Relative difference, in percent, between the "classes+logic" and "classes" geometric means.
# NOTE(review): "classes+logic" is not one of the strategies configured in this notebook
# ("classes", "items+logic") -- confirm it exists in result.csv, otherwise this raises KeyError.
(r.loc["classes+logic"] / r.loc["classes"] * 100 - 100).round(3)

In [None]:
# Ratio of the "classes" geometric means to the "items+logic" ones.
(r.loc["classes"] / r.loc["items+logic"]).round(2)

In [None]:
# Ratio of the "items+logic" geometric means to the "classes" ones (inverse of the previous cell).
(r.loc["items+logic"] / r.loc["classes"]).round(2)

### Graphical Results

In [None]:
def draw_diagram(full):
    """Draw three cumulative plots comparing the strategies and return the figure.

    Each panel plots, per strategy, the sorted values of one measure on the
    x-axis against their rank (1, 2, ...) on the y-axis:

    1. the ``time`` of every run that is below ``TIMEOUT``,
    2. the final ``classes`` divided by ``initial_classes``,
    3. the final ``bytes`` divided by ``initial_bytes``.

    The geometric mean of every curve is marked with a dot and a text label,
    and a dotted horizontal line is drawn at half of ``number_of_benchmarks``.

    Parameters
    ----------
    full : pandas.DataFrame
        Results with a ``strategy`` index level and ``time``, ``classes`` and
        ``bytes`` columns.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the three panels.

    Notes
    -----
    Reads the module-level names ``strategies``, ``labels``, ``styles``,
    ``colors``, ``TIMEOUT``, ``initial_classes``, ``initial_bytes``,
    ``number_of_benchmarks`` and ``pretty``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(10,3.5), sharey=True)

    def format_h_mm(x, pos):
        # Round to whole minutes first, then split into hours and minutes.
        # Formatting x / 3600 with "0.0f" would round the hour to the nearest
        # integer, so that e.g. 1:30 is rendered as "2:30".
        return '{}:{:02d}'.format(*divmod(int(round(x / 60)), 60))

    # One entry per panel. "data" maps a strategy name to a sorted list of
    # values; "title" is not read below.
    diagrams = [
        { "title": "Finished Programs over Time"
        , "xformat": format_h_mm
        , "labelformat": lambda x: f'{x:0.1f}s'
        , "data": lambda s: list(sorted(d for d in full["time"].unstack("strategy")[s] if d < TIMEOUT))
        , "xlim": (0, 10 * 3600)
        , "xticks": np.linspace(0, 10*3600, 6)
        , "xlabel": "Time Spend (h:mm)"
        , "percent": False
        },
        { "title": "Finished Programs over Invocations"
        , "xformat": lambda x, pos: f'{x*100:0.0f}%'
        , "labelformat": lambda x: f'{x*100:0.1f}%'
        , "data": lambda s: sorted(full["classes"].unstack("strategy")[s] / initial_classes)
        , "xticks": np.linspace(0,1, 6)
        , "xlabel": "Final Relative Size (Classes)"
        },
        { "title": "Finished Programs over Invocations"
        , "xformat": lambda x, pos: f'{x*100:0.0f}%'
        , "labelformat": lambda x: f'{x*100:0.1f}%'
        , "data": lambda s: sorted(full["bytes"].unstack("strategy")[s] / initial_bytes)
        , "xticks": np.linspace(0,1, 6)
        , "xlabel": "Final Relative Size (Bytes)"
        },
        ]

    # Height of the dotted "median" line: half of the benchmarks.
    median_rank = round(number_of_benchmarks / 2)

    for diagram, ax in zip(diagrams, axes.flatten()):
        maxx = 0
        pretty(ax)

        # Evaluate every strategy's series once per panel.
        series = {s: diagram["data"](s) for s in strategies}

        strats = sorted(strategies, key=lambda s: np.mean(series[s]))
        for s in strats:
            data = series[s]
            if not data:
                # Nothing to draw for this strategy (e.g. every run timed out).
                continue
            ax.plot(data, range(1, len(data) + 1),
                    label=labels[s],
                    linestyle=styles[s],
                    color=colors[s])
            maxx = max(maxx, max(data))

            # Rank of the first value above the geometric mean; falls back to
            # the last rank when no value exceeds it (all values equal).
            mean = stats.gmean(data)
            rank = next((i + 1 for i, value in enumerate(data) if value > mean), len(data))

            ax.scatter(mean, rank, color=colors[s])
            # Label offsets differ per strategy; they look hand-tuned so that
            # the two labels do not overlap.
            if s == "items+logic":
                loc = (mean + 6 / 100 * diagram["xticks"][-1], rank * 1.05 - 2)
            else:
                loc = (mean + 5 / 100 * diagram["xticks"][-1], rank - 11)

            ax.text(*loc, diagram["labelformat"](mean)
                    , color=colors[s]
                    , bbox=dict(boxstyle="round", fc="white", ec="white")
                   )

        xlim = diagram.get("xlim", (0, maxx))
        ax.set_xlim(*xlim)
        xtics = diagram.get("xticks", np.linspace(*xlim, 7))
        ax.set_xticks(xtics)

        ylim = 0, number_of_benchmarks
        ax.set_yticks(np.linspace(*ylim, 7).round())
        ax.set_yticks([], minor=True)
        ax.set_ylim(*ylim)
        if ax is axes[0]:
            ax.set_ylabel("Benchmarks")

        if diagram.get("percent", False):
            # NOTE(review): the tick positions are hard-coded, and no diagram
            # above sets "percent" to True, so this branch is currently not taken.
            ax2 = ax.twinx()
            pretty(ax2)

            yticks = [227, 200]
            strats = sorted(strategies, key=lambda s: -len(series[s]))
            ytickslabels = [f"{(len(series[s]) - 1) / number_of_benchmarks * 100:0.0f} %" for s in strats]
            ax2.set_yticks(yticks)
            ax2.set_yticklabels(ytickslabels)

        ax.xaxis.set_major_formatter(plt.FuncFormatter(diagram.get("xformat", lambda x, pos: f'{x:0.0f}')))

        v = ax.hlines(median_rank, *xlim)
        v.set_color("gray")
        v.set_linestyle(":")

        if ax is axes[0]:
            # Place the label just above the median line drawn above.
            t = axes[0].text(15005, median_rank + 4.5, "MEDIAN")
            t.set_color("gray")

        ax.set_xlabel(diagram["xlabel"])

    fig.tight_layout()
    fig.subplots_adjust(wspace=0.13)
    axes[2].legend(loc="lower right")
    return fig

# Render the comparison figure for the timeout-adjusted results and save it as EPS.
fig = draw_diagram(full)
fig.savefig("timings.eps")

The graphs are formatted like in the previous article: In the top row we have the number of programs that complete before a certain time and number of iterations. In the bottom row we have the number of programs that have been reduced to a size below a certain number of bytes or classes.


In [None]:
def _gmean_over_time(path):
    """Return the per-strategy geometric mean of every sample column in ``path``.

    The CSV is grouped by its ``strategy`` column and every remaining column is
    aggregated with ``stats.gmean``. The result is transposed, so that there is
    one row per sample column and one column per strategy, and the row labels
    are converted to ``int``.

    The identifying ``name`` and ``predicate`` columns (when present) are
    dropped before aggregating: they hold text, which ``stats.gmean`` cannot
    aggregate, and recent pandas versions raise instead of silently skipping
    such columns.
    """
    samples = pd.read_csv(path).drop(columns=["name", "predicate"], errors="ignore")
    return samples.groupby("strategy").agg(stats.gmean).T.rename(int)


# Geometric mean of the bytes / classes left at each sample point, per strategy.
# NOTE(review): the plotting cell multiplies the row labels by 60, so the sample
# points are presumably minutes -- confirm against the evaluation framework.
tbytes = _gmean_over_time("result/full/bytes.csv")
tclasses = _gmean_over_time("result/full/classes.csv")

# Geometric mean of the final classes / bytes, per strategy.
fclasses = results.classes.groupby("strategy").agg(stats.gmean)
fbytes = results.bytes.groupby("strategy").agg(stats.gmean)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(10,6))

# One entry per panel. The loop below reads:
#   "data"      -- frame with one column per strategy, indexed by sample point,
#   "best"      -- per-strategy value drawn as a dotted horizontal line,
#   "format"    -- y tick formatter,
#   "percent"   -- True: the twin axis gets a percent formatter;
#                  False: the twin axis mirrors the y-limits, gets ticks at the
#                  "best" and one-hour values, and both axes are inverted,
#   "ylim", "yscale", "yticks", "ylabel", "xlabel", "quantiles", "yticks2" -- optional.
# "title" is not read by the loop.
diagrams = [
    { "title": "Mean Classes Left Over Time"
    , "data":  tclasses
    , "format": lambda x, pos: f"{x:0.0f}"
    , "percent": True
    , "ylabel": "Mean Classes Left"
    , "best": fclasses
    },
    { "title": "Mean Bytes Left Over Time (h:m)"
    , "data":  tbytes
    , "format": lambda x, pos: f"{x / 1000:0.0f} KB"
    , "percent": True
    , "ylabel": "Mean Bytes Left"
    , "best": fbytes
    },
    { "title": "Mean Reduction of Classes Over Time"
    , "data":  tclasses.rdiv(tclasses.max())
    , "format": lambda x, pos: f"x {x:0.0f}"
    , "ylim": (1, 25)
    , "yscale": "linear"
    , "yticks": np.linspace(1, 25, 7)
    , "ylabel": "Mean Times Smaller (Classes)"
    , "xlabel": "Time Spend (h:mm)"
    , "percent": False
    , "best": fclasses.rdiv(tclasses.max())
    },
    { "title": "Mean Reduction of Bytes Over Time"
    , "data":  tbytes.rdiv(tbytes.max())
    , "ylim": (1, 25)
    , "yscale": "linear"
    , "yticks": np.linspace(1, 25, 7)
    , "format": lambda x, pos: f"x {x:0.0f}"
    , "ylabel": "Mean Times Smaller (Bytes)"
    , "percent": False
    , "xlabel": "Time Spend (h:mm)"
    , "best": fbytes.rdiv(tbytes.max())
    }
]

for ax, diagram in zip(axes.flatten(), diagrams):
    data = diagram["data"]
    # The sample points are multiplied by 60 to get the x positions.
    seconds = data.index * 60
    span = (seconds.min(), seconds.max())

    pretty(ax)
    for s in reversed(strategies):
        ax.plot(seconds, data[s], label=labels[s], color=colors[s], linestyle=styles[s])

        # Dotted reference line at this strategy's "best" value.
        v = ax.hlines(diagram["best"][s], *span)
        v.set_color("lightgray")
        v.set_linestyle(":")

        quantiles = diagram.get("quantiles", None)
        if quantiles:
            low,high = quantiles
            ax.fill_between(low.index * 60, low[s], high[s], label=labels[s], color=shade[s], linestyle=styles[s])

    ylim = diagram.get("ylim", (0, data[strategies].max().max()))
    ax.set_ylim(*ylim)
    ax.set_yscale(diagram.get("yscale", "linear"))
    yticks = diagram.get("yticks", np.linspace(*ylim, 6).round())
    ax.set_yticks([],minor=True)
    ax.set_yticks(yticks)
    yformat = diagram["format"]
    ax.yaxis.set_major_formatter(plt.FuncFormatter(yformat))

    ax.set_ylabel(diagram.get("ylabel"))

    xlabel = diagram.get("xlabel")
    ax.set_xlabel(xlabel)

    # Every panel gets a right-hand twin axis labelled "Percentage Left".
    ax2 = ax.twinx()
    pretty(ax2)
    ax2.spines['right'].set_visible(True)
    ax2.set_ylabel("Percentage Left")
    if diagram.get("percent", False):
        ax2.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(1, 0))
    else:
        ax2.set_ylim(*ylim)
        ax2.set_yscale(diagram.get("yscale", "linear"))

        # Value of the "items+logic" curve at sample point 60.
        onehour = (diagram["data"]["items+logic"].loc[60])

        v = ax.hlines(onehour, *span)
        v.set_color("lightgray")
        v.set_linestyle(":")

        # Twin-axis ticks sit at the "best" values and at the one-hour value,
        # labelled with the reciprocal expressed as a percentage.
        yticks, ytickslabels = zip(
            *diagram.get("yticks2",
                        [ (d, f"{1/d * 100:0.1f}%") for d in (diagram["best"][s] for s in strategies)
                        ] + [(onehour, f"{1/onehour * 100:0.1f}%")]
            ))
        ax2.set_yticks(yticks)
        ax2.set_yticklabels(ytickslabels)
        ax2.set_yticks([],minor=True)

        ax2.invert_yaxis()
        ax.invert_yaxis()

    xlim = (0, 60 * 60 * 2)
    ax.set_xlim(*xlim)
    ax.set_xticks(np.linspace(*xlim, 5))
    # Round to whole minutes, then split into hours and minutes. Formatting
    # x / 3600 with "0.0f" rounds the hour to the nearest integer, which would
    # label the 1:30 tick as "2:30".
    ax.xaxis.set_major_formatter(plt.FuncFormatter(
        lambda x, pos: '{}:{:02d}'.format(*divmod(int(round(x / 60)), 60))))


axes[0][0].legend()
fig.tight_layout()
fig.subplots_adjust(hspace=0.20)
fig.savefig("by-time.eps")
    

In [None]:
# Load the raw classes-over-time samples (before any per-strategy aggregation).
xclasses = pd.read_csv("result/full/classes.csv")

In [None]:
# Index the raw samples by (name, predicate, strategy), like `results`.
a = xclasses.set_index(["name","predicate", "strategy"])

# Show the column labelled "0".
a["0"]

In [None]:
# Display the per-strategy geometric means of classes over time.
tclasses