This is the notebook, which contains the results of running our evaluation.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import scipy.stats as stats
from pathlib import Path

def pretty(ax):
    """Restyle *ax* in place with a lighter, gray look.

    The right and top spines are hidden.  The left and bottom spines are
    moved 5 points outward and colored gray.  For every current major tick
    of both axes, the primary label and the primary tick line are colored
    gray as well.
    """
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

    for side in ("left", "bottom"):
        kept = ax.spines[side]
        kept.set_position(("outward", 5))
        kept.set_color("gray")

    # Only ticks that exist at call time are touched.
    for axis in (ax.yaxis, ax.xaxis):
        for tick in axis.get_major_ticks():
            tick.label1.set_color("gray")
            tick.tick1line.set_color("gray")


# Full

This section covers the evaluation where we preserve the full bug. We start by loading the data and indexing it by `name`, `predicate`, and `strategy`. The data have been computed and put in `result/full/result.csv` by our evaluation framework.

In [None]:
# Read the full-bug measurements and key every row by benchmark name,
# predicate and reduction strategy.
results = pd.read_csv("result/full/result.csv")
results = results.set_index(["name", "predicate", "strategy"])


A single line of our data looks like this; we store the following data: 

*  `bugs` which contain the number of lines in the cleaned up bug-report

*  `initial-scc` and `scc` contain the number of strongly connected components before and after reduction,

*  `initial-classes` and `classes` contain the number of classes before and after reduction,

*  `initial-bytes` and `bytes` contain the number of bytes before and after reduction,

*  `iters` which contain the number of invocations of the predicate, 

*  `searches` which contains the number of binary searches made by the algorithm,

*  `time` which records the time to reach the final successful solution,

*  `status` which records whether the reduction completed correctly,

*  `verify` which records information about whether the bug is preserved.

Here is an example:


In [None]:
# Example lookup by (name, predicate, strategy): one benchmark, checked with
# the "cfr" predicate and reduced with the "classes" strategy.
results.loc["url0067cdd33d_goldolphin_Mi", "cfr", "classes"]

In [None]:
# Histogram of the `bugs` column over all runs, restyled with pretty().
bugs_hist = results.bugs.plot.hist()
pretty(bugs_hist)

We are testing four strategies: 

- classes
- logic+approx
- logic+graph
- logic


In [None]:
# Strategies are written down in presentation order and stored reversed;
# the summary tables flip the list back with reversed(strategies).
_presentation_order = [
    "classes",
    "logic+ddmin",
    "logic+ddmin+rev",
    "logic+approx",
    "logic+approx+rev",
    "logic",
]
strategies = _presentation_order[::-1]

# Plot color per series name.  "initial" and "logic+graph" have entries here
# even though they are not part of `strategies`.
colors = dict([
    ("initial", "gray"),
    ("classes", "red"),
    ("logic+approx", "orange"),
    ("logic+approx+rev", "gold"),
    ("logic+graph", "orange"),
    ("logic+ddmin", "green"),
    ("logic+ddmin+rev", "lightgreen"),
    ("logic", "blue"),
])


## Sanity Checks

Before we go on to evaluate the code we check that the system is working correctly. First we check that the status is "success". We find the following distribution of statuses:


In [None]:
# Horizontal bar chart of the share of runs, per strategy, whose status is
# "timeout", drawn on a fixed 0-100 axis.
fig, ax = plt.subplots(1, figsize=(14, 2))

timeouts = results.status.eq("timeout").groupby("strategy").mean()

ax.set_xlim(0, 100)
pretty(ax)

timeout_percent = [100 * timeouts[name] for name in strategies]
bar_colors = [colors[name] for name in strategies]
x = ax.barh(strategies, timeout_percent, color=bar_colors)

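The following lists the experiments where the `classes` strategy ended with fewer classes than `logic`, printing a line each time a smaller input (in initial bytes) is found: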

In [None]:
# Walk over all "classes" runs and report those that end with fewer classes
# than the matching "logic" run.  A line is printed only when the benchmark
# is smaller (in initial bytes) than every one reported so far, so the last
# line printed is the smallest such benchmark.
m = 10000000
for i in results.index:
    name, predicate, strategy = i
    if strategy == "classes":
        logic_key = (name, predicate, "logic")
        if results.classes.loc[i] < results.classes.loc[logic_key]:
            initial_bytes = results["initial-bytes"][i]
            if initial_bytes < m:
                m = initial_bytes
                print('/'.join(i), initial_bytes, results.bytes[i], results.bytes[logic_key])

## Comparative reduction

In our comparative reduction results we will set the time of every run that timed out to the timeout limit.

In [None]:
# Time charged to a run that timed out; the time axis of the plots below is
# labelled in seconds.
TIMEOUT = 3600

# Work on a copy so `results` keeps the raw measurements.
full = results.copy()

# Select rows and column in a single .loc call.  The previous
# `full.time.loc[mask] = ...` was chained assignment: it writes to whatever
# `full.time` returns, which may be a temporary, so `full` itself is not
# guaranteed to change (and never changes under pandas copy-on-write).
full.loc[full.status == "timeout", "time"] = TIMEOUT

In our first experiment we are going to look at comparative final size, and time. We use the geometric mean, so that we can compare the results:

In [None]:
# Geometric mean of the final bytes, classes and time: one row per metric,
# one column per strategy.
keyvalues = (
    full.filter(["bytes", "classes", "time"], axis=1)
    .unstack("strategy")
    .agg(stats.gmean)
    .unstack()
)

# Geometric mean of the initial sizes, taken from the "classes" strategy
# column (presumably the initial sizes are the same for every strategy --
# TODO confirm).  The labels are renamed to "bytes"/"classes" so that they
# line up with the rows of `keyvalues`.
#
# The prefix is removed by slicing: str.lstrip("initial-") strips any leading
# run of the characters i, n, t, a, l and "-", not the literal prefix, and
# only produced the right labels here by coincidence.
v = (
    full.filter(["initial-bytes", "initial-classes"], axis=1)
    .unstack("strategy")
    .agg(stats.gmean)
    .unstack()["classes"]
    .rename(lambda a: a[len("initial-"):] if a.startswith("initial-") else a)
)

In [None]:
# Geometric means per metric (rows) and strategy (columns).  `strategies` is
# stored reversed, so it is flipped back for the column order.
keyvalues.round(1)[list(reversed(strategies))]

In [None]:
# Final bytes and classes as a percentage of the geometric-mean initial size
# in `v`, rounded to one decimal, with columns in un-reversed strategy order.
(keyvalues.loc[["bytes","classes"]].div(v, axis='rows') * 100).round(1)[list(reversed(strategies))]

### Graphical Results

In [None]:
def draw_diagram(full):
    """Plot cumulative result curves for every strategy and return the figure.

    One panel is drawn for each of "time", "iters", "classes" and "bytes".
    Within a panel every strategy contributes one curve: its sorted metric
    values on the x axis against their 1-based rank on the y axis.
    "classes" and "bytes" are shown as a fraction of the matching
    "initial-" column.

    Relies on the module-level `strategies`, `colors`, `TIMEOUT` and
    `pretty`.

    Args:
        full: DataFrame with an index level named "strategy" and the columns
            "time", "iters", "classes", "bytes", "initial-classes" and
            "initial-bytes".

    Returns:
        The matplotlib figure holding the four panels.
    """

    def as_seconds(value, pos):
        return f'{value:0.0f} s'

    def as_count(value, pos):
        return f'{value:0.0f}'

    def as_percent(value, pos):
        return f'{100 * value:0.0f}%'

    fig, axes = plt.subplots(1, 4, figsize=(12,4), sharey=True)

    for lb, ax in zip(["time", "iters", "classes", "bytes"], axes.flatten()):
        x = full[lb].unstack("strategy")
        if lb != "time" and lb != "iters":
            # Sizes are plotted relative to the size before reduction.
            x = x / full["initial-" + lb].unstack("strategy")

        maxx = 0
        for key in strategies:
            if lb == "time":
                # Runs at or above the timeout are left out of the curve; a
                # single closing point at TIMEOUT extends it to the edge.
                data = sorted(d for d in x[key] if d < TIMEOUT)
                data.append(TIMEOUT)
            else:
                data = sorted(x[key])

            ax.plot(data, list(range(1, len(data) + 1)), label=key, color=colors[key])
            maxx = max(maxx, max(x[key]))

        # One row per run, so the row count is the height of the y axis.
        ylim = 0, len(x)
        ax.set_yticks(np.linspace(*ylim, 7))
        ax.set_ylim(*ylim)

        if lb == "time":
            xlim = 0, TIMEOUT
            formatter = as_seconds
        elif lb == "iters":
            xlim = 0, maxx
            formatter = as_count
        else:
            xlim = 0, 1
            formatter = as_percent

        ax.set_xticks(np.linspace(*xlim, 5))
        ax.set_xlim(*xlim)
        ax.xaxis.set_major_formatter(plt.FuncFormatter(formatter))

        # NOTE(review): 102 is hard-coded; presumably it marks the total
        # number of runs per strategy -- confirm, or derive it from the data.
        v = ax.hlines(102, *xlim)
        v.set_color("lightgray")
        v.set_linestyle(":")

        ax.set_xlabel(lb)
        pretty(ax)

    axes[3].legend()
    fig.tight_layout()

    return fig

# Draw the four-panel comparison for the full-bug results and save it as a PDF.
fig = draw_diagram(full)
fig.savefig("new-graph.pdf")

The graphs are formatted like the previous article: In the first two panels we have the number of programs that complete before a certain time and number of iterations. In the last two panels we have the number of programs that have been reduced to below a certain fraction of their initial classes or bytes.


In [None]:
from pathlib import Path


# For every benchmark/predicate folder, build two tables (classes and bytes)
# with one row per minute bucket 0..60 and one column per strategy, holding
# the best size reached so far as a fraction of the first metrics row.
glob = Path("result/full/").glob("url*/*")
dfCs = []
dfBs = []
j = 0  # number of predicate folders visited; not used by the analysis below
for b in glob:
    if b.name not in {"cfr", "procyon", "fernflower"}:
        continue
    j += 1
    try:
        metrics = list(b.glob("*/workfolder/metrics.csv"))
        if not metrics:
            continue

        dfC = pd.DataFrame()
        dfB = pd.DataFrame()
        for i in metrics:
            # The first path component below the predicate folder names the
            # strategy.
            strat, *_ = i.relative_to(b).parts
            m = pd.read_csv(i)

            # Bucket the timestamps: [0, 60) -> 1, [60, 120) -> 2, ...
            m["time"] = m["time"].floordiv(60) + 1

            # Smallest successful size per bucket, divided by the first row
            # (presumably the unreduced input -- TODO confirm).  The running
            # minimum carries the best result forward through buckets with
            # no success; buckets before the first success count as 1.
            x = (
                m[m.judgment == "success"]
                .groupby("time")[["classes", "bytes"]]
                .min()
                .div(m.iloc[0][["classes", "bytes"]])
                .reindex(index=range(0, 61))
                .expanding().min()
                .fillna(1)
            )

            dfC = dfC.assign(**{strat: x["classes"]})
            dfB = dfB.assign(**{strat: x["bytes"]})

        dfCs.append(dfC)
        dfBs.append(dfB)
    except Exception as exc:
        # Best effort: skip a folder that cannot be processed, but name the
        # folder and the cause.  The previous handler printed `i`, which may
        # be stale or unbound when the failure happens before the inner loop,
        # and its bare `except:` also swallowed KeyboardInterrupt.
        print("WARNING", b, repr(exc))
        continue

In [None]:
# 2x2 grid of size-over-time curves: mean on the top row, median on the
# bottom row; classes on the left, bytes on the right.
fig, axes = plt.subplots(2, 2, figsize=(11,6), sharey=True, sharex=True)


def _mean_of(grouped):
    return grouped.mean()


def _median_of(grouped):
    return grouped.median()


def _as_percent(value, pos):
    return f'{value*100:0.0f}%'


def _as_seconds(value, pos):
    return f'{value:0.0f} s'


graphs = [
    ("Mean Percentage of Classes Left", dfCs, _mean_of),
    ("Mean Percentage of Bytes Left", dfBs, _mean_of),
    ("Median Percentage of Classes Left", dfCs, _median_of),
    ("Median Percentage of Bytes Left", dfBs, _median_of),
]

for panel, (heading, frames, reduce_groups) in zip(axes.flat, graphs):
    # Stack the per-benchmark tables and aggregate them per minute bucket.
    stacked = pd.concat(frames, keys=range(len(frames)))
    summary = reduce_groups(stacked.groupby("time"))

    for name in strategies:
        curve = summary[name]
        # The index holds minute buckets; the x axis is in seconds.
        panel.plot(summary.index * 60, curve, label=name, color=colors[name])

        # Dotted guide at the lowest value this strategy reaches.
        guide = panel.hlines(min(curve), 0, 3600)
        guide.set_color("lightgray")
        guide.set_linestyle(":")

    panel.set_ylim(0, 1)
    panel.set_xlim(0, 3600)
    panel.set_xticks(np.linspace(0, 3600, 7))
    panel.set_title(heading)

    panel.yaxis.set_major_formatter(plt.FuncFormatter(_as_percent))
    panel.xaxis.set_major_formatter(plt.FuncFormatter(_as_seconds))

    pretty(panel)

# The legend goes on the last panel drawn.
panel.legend()
fig.tight_layout()
fig.savefig("new-approach.pdf")
    

In [None]:
# NOTE(review): this cell repeats the plotting cell directly above it and
# writes to the same "new-approach.pdf"; presumably one of the two can be
# dropped -- confirm.
#
# 2x2 grid of size-over-time curves: mean/median by row, classes/bytes by
# column, one curve per strategy.
fig, axes = plt.subplots(2, 2, figsize=(11,6), sharey=True, sharex=True)

# (panel title, per-benchmark tables, aggregation applied to the groupby)
graphs = [
    ("Mean Percentage of Classes Left", dfCs, lambda x: x.mean()),
    ("Mean Percentage of Bytes Left", dfBs, lambda x: x.mean()),
    ("Median Percentage of Classes Left", dfCs, lambda x: x.median()),
    ("Median Percentage of Bytes Left", dfBs, lambda x: x.median()),
]

for ax, (title, dfc, fn) in zip(axes.flatten(), graphs):
    # Stack all per-benchmark tables and aggregate them per "time" bucket.
    m = fn(pd.concat(dfc, keys=range(0, len(dfc))).groupby("time"))
    for s in strategies:
        # The index holds minute buckets; the x axis is in seconds.
        ax.plot(m.index * 60, m[s], label=s, color=colors[s])

        # Dotted guide at the lowest value this strategy reaches.
        v = ax.hlines(min(m[s]),0, 3600)
        v.set_color("lightgray")
        v.set_linestyle(":")

    ax.set_ylim(0,1)
    ax.set_xlim(0,3600)
    ax.set_xticks(np.linspace(0,3600, 7))
    ax.set_title(title)

    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: f'{x*100:0.0f}%'))
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: f'{x:0.0f} s'))

    pretty(ax)

# `ax` is the last panel of the loop above, so the legend lands there.
ax.legend()
fig.tight_layout()
fig.savefig("new-approach.pdf")
    

## Extra evaluation

Here I have left some space for some extra interesting questions: 

The first question is how the size of the input in bytes affects the time to set up and run the predicate. In this case it is fernflower.

The interesting thing here is that the execution time of the predicate is dependent on the size of the input, and by testing small inputs it can be up to 10 times faster than testing the large inputs.

In [None]:
# Scatter the predicate's "setup time" and "run time" against the input size
# in bytes for one fernflower benchmark.  The cell stays best effort (the
# metrics file may be missing), but a failure is now reported instead of
# being silently discarded, and KeyboardInterrupt is no longer swallowed.
try:
    metrics = pd.read_csv("result/full/url0e7ea11f42_rbouckaert_DensiTree/fernflower/logic/workfolder/metrics.csv")

    fig, axes = plt.subplots(2, 1, figsize=(7,7), sharex=True)

    for key, ax in zip(["setup time", "run time"], axes):
        ax.scatter(metrics.bytes, metrics[key])
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: f'{x:0.1f} s'))
        # Leave 10% headroom beyond the largest value on both axes.
        ax.set_xlim(0, metrics.bytes.max() * 1.1)
        ax.set_ylim(0, metrics[key].max() * 1.1)
        ax.set_title(key)
        # pretty() hides the right and top spines, so no separate calls.
        pretty(ax)

    # The x axis is shared; bytes are shown in units of 1000 bytes ("KB",
    # not "Kb", which would read as kilobits).
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: f'{x/1000:0.0f} KB'))

    fig.tight_layout()
except Exception as exc:
    print("WARNING: could not draw the predicate timing plot:", repr(exc))

# Part

Here we analyse the results given that only one bug is being preserved by Javac.

In [None]:
# Read the single-bug measurements and key every row by benchmark name,
# predicate and reduction strategy.
part_results = pd.read_csv("result/part/result.csv")
part_results = part_results.set_index(["name", "predicate", "strategy"])

In [None]:
# Horizontal bar chart of the share of `part_results` runs, per strategy,
# whose status is "timeout", drawn on a fixed 0-100 axis.
fig, ax = plt.subplots(1, figsize=(14, 2))

timeouts = part_results.status.eq("timeout").groupby("strategy").mean()

ax.set_xlim(0, 100)
pretty(ax)

timeout_percent = [100 * timeouts[name] for name in strategies]
bar_colors = [colors[name] for name in strategies]
x = ax.barh(strategies, timeout_percent, color=bar_colors)

## Comparative reduction

In our first experiment we are going to look at comparative final size, and time. We use the geometric mean, so that we can compare the results:

In [None]:
# Time charged to a run that timed out.
TIMEOUT = 3600

# Work on a copy so `part_results` keeps the raw measurements.
part = part_results.copy()

# Select rows and column in a single .loc call.  The previous
# `part.time.loc[mask] = ...` was chained assignment: it writes to whatever
# `part.time` returns, which may be a temporary, so `part` itself is not
# guaranteed to change (and never changes under pandas copy-on-write).
part.loc[part.status == "timeout", "time"] = TIMEOUT

In [None]:
# Geometric mean of the final bytes, classes and time: one row per metric,
# one column per strategy.
keyvalues = (
    part.filter(["bytes", "classes", "time"], axis=1)
    .unstack("strategy")
    .agg(stats.gmean)
    .unstack()
)

# Geometric mean of the initial sizes, taken from the "classes" strategy
# column (presumably the initial sizes are the same for every strategy --
# TODO confirm).  The labels are renamed to "bytes"/"classes" so that they
# line up with the rows of `keyvalues`.
#
# The prefix is removed by slicing: str.lstrip("initial-") strips any leading
# run of the characters i, n, t, a, l and "-", not the literal prefix, and
# only produced the right labels here by coincidence.
v = (
    part.filter(["initial-bytes", "initial-classes"], axis=1)
    .unstack("strategy")
    .agg(stats.gmean)
    .unstack()["classes"]
    .rename(lambda a: a[len("initial-"):] if a.startswith("initial-") else a)
)

Geometric Averages

In [None]:
# Geometric means per metric (rows) and strategy (columns).  `strategies` is
# stored reversed, so it is flipped back for the column order.
keyvalues.round(1)[list(reversed(strategies))]

We can compare them on how much reduction each of them has made.

In [None]:
# Final bytes and classes as a percentage of the geometric-mean initial size
# in `v`, rounded to one decimal, with columns in un-reversed strategy order.
(keyvalues.loc[["bytes","classes"]].div(v, axis='rows') * 100).round(1)[list(reversed(strategies))]

### Graphical Results

In [None]:
# Draw the same four-panel comparison for the single-bug results; unlike the
# full-bug figure, this one is not saved to a file here.
fig = draw_diagram(part)