This is the notebook, which contains the results of running our evaluation.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from pathlib import Path

def investigate(name, predicate, strategy):
    """Print the compiler log and the Java sources left in a final sandbox.

    The sandbox is looked up at
    ``result/<name>/<predicate>/<strategy>/workfolder/final/sandbox/<predicate>``.
    The text of ``compiler.out.txt`` is printed first, followed by the path
    and the contents of every ``*.java`` file found below ``src``, and
    finally a ``=======`` separator line.
    """
    workfolder = Path("result") / name / predicate / strategy / "workfolder"
    sandbox = workfolder / "final/sandbox" / predicate

    compiler_log = sandbox / "compiler.out.txt"
    print(compiler_log.read_text())

    for java_file in sandbox.glob("src/**/*.java"):
        # Path first, contents second, so each listing is easy to attribute.
        print(java_file)
        print(java_file.read_text())

    print("=======")
    
def check_verify(name, predicate, strategy):
    """Print the stdout lines that only occur in a failed verification run.

    The ``verify`` value of the experiment is read from the global
    ``results`` table. Nothing happens when it is ``"success"``. Otherwise the
    value is used as a directory name below ``workfolder/reduction`` and the
    set of lines in that directory's ``stdout`` file which are absent from
    ``workfolder/initial/stdout`` is printed.
    """
    workfolder = Path("result") / name / predicate / strategy / "workfolder"
    verdict = results.loc[(name, predicate, strategy)]["verify"]
    if verdict == "success":
        return

    def stdout_lines(run_folder):
        # Distinct lines of the run's captured standard output.
        return set((run_folder / "stdout").read_text().splitlines())

    reduced_lines = stdout_lines(workfolder / "reduction" / verdict)
    initial_lines = stdout_lines(workfolder / "initial")

    print(reduced_lines - initial_lines)

# Full

This section covers the evaluation where we preserve the full bug. We start by loading the data and indexing it by `name`, `predicate`, and `strategy`. The data have been computed and put in `result/result.csv` by our evaluation framework.

In [None]:
# Read the result table and key every row by its
# (name, predicate, strategy) triple.
results = pd.read_csv("result/result.csv")
results = results.set_index(["name", "predicate", "strategy"])


A single line of our data looks like this. We store the following data: 

*  `bugs` which contain the number of lines in the cleaned up bug-report

*  `initial-scc` and `scc` contain the number of strongly connected components before and after reduction,

*  `initial-classes` and `classes` contain the number of classes before and after reduction,

*  `initial-bytes` and `bytes` contain the number of bytes before and after reduction,

*  `iters` which contain the number of invocations of the predicate, 

*  `searches` which contains the number of binary searches made by the algorithm,

*  `time` which records the time to reach the final successful solution,

*  `status` which records whether the reduction completed correctly,

*  `verify` which records information about whether the bug is preserved.


In [None]:
# Display the stored row of one run, addressed by its
# (name, predicate, strategy) index key.
example_key = ("url0067cdd33d_goldolphin_Mi", "cfr", "classes")
results.loc[example_key]


## Sanity Checks

Before we go on to evaluate the code we check that the system is working correctly. First we check that the status is "success". We find the following distribution of statuses:


In [None]:
# One pie chart per sanity-check column, showing how often each value occurs.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for column, ax in zip(["status", "verify", "flaky"], axes):
    results[column].value_counts().plot.pie(ax=ax)
    # Name every chart explicitly: the y-axis label pandas derives from the
    # counted series depends on the pandas version and can end up being the
    # same for all three charts.
    ax.set_title(column)
    ax.set_ylabel("")

fig.tight_layout()

The following is a list of all the experiments that failed:

In [None]:
# Print one line per run whose status is not "success":
# name/predicate/strategy, status, bugs, searches, iters.
not_successful = results.status != "success"
for run in results[not_successful].index:
    fields = (
        "/".join(run),
        results.loc[run].status,
        results.bugs[run],
        results.searches[run],
        results.iters[run],
    )
    print(*fields)

We also want to make sure that we do not use more searches than bugs, as this probably means that we have an incomplete description of Java.

In [None]:
# Runs that needed more searches than their `bugs` count, restricted to the
# "logic+extends+over" strategy (third component of the index key).
more_searches_than_bugs = results.searches > results.bugs
for run in results[more_searches_than_bugs].index:
    if run[2] == "logic+extends+over":
        print(
            "/".join(run),
            int(results.bugs[run]),
            int(results.searches[run]),
            results.classes[run],
            results.verify[run],
        )

In [None]:
# Runs whose `verify` value is not "success", restricted to the
# "logic+extends+over" strategy (third component of the index key).
not_verified = results.verify != "success"
for run in results[not_verified].index:
    if run[2] == "logic+extends+over":
        print(
            "/".join(run),
            int(results.bugs[run]),
            results.verify[run],
            int(results.searches[run]),
            results.classes[run],
        )

From now on we only work with data where all evaluation techniques succeed:

In [None]:
# A (name, predicate) benchmark counts as successful only when every strategy
# that was run on it ended with status "success".
is_success = results.status == "success"
per_benchmark = is_success.groupby(["name", "predicate"])

# One boolean per (name, predicate) pair; other cells index into this series.
success = per_benchmark.all()

# `transform` broadcasts the per-benchmark verdict back onto the rows of
# `results`, giving a mask that is aligned with `results.index`. This avoids
# looking up the two-level `success` series with the three-level row index,
# which depends on how pandas matches keys that carry an extra level.
sucessfull = results[per_benchmark.transform("all")]

## Comparative reduction

In our first experiment we are going to look at comparative final size, and time. We use the geometric mean, so that we can compare the results:

In [None]:
# Geometric mean of final bytes, classes and time, laid out as one row per
# measure and one column per strategy.
per_strategy = sucessfull.filter(["bytes", "classes", "time"], axis=1).unstack("strategy")
geometric_means = per_strategy.agg(stats.gmean)
keyvalues = geometric_means.unstack()

print("Geomertric averages:")
print(keyvalues.round(1))

# Same table with every strategy divided by the "classes" strategy column.
print("\nRelative to classes:")
relative_to_classes = keyvalues.div(keyvalues["classes"], axis=0)
print(relative_to_classes.round(2))


We can see that on average the over-approximation and under-approximation perform 6 times better on number of bytes and 2 times better on number of classes; they do, however, take 3-4 times longer.

We can get more detailed information by inspecting the graphs for runtimes.

In [None]:
# Cumulative plots, one panel per measure: a point (v, k) on a strategy's
# curve is the k-th smallest value v of that strategy.
fig, axes = plt.subplots(2, 2, figsize=(9, 7), sharey=True)

strategies = ["logic+over", "logic+extends+over", "classes"]

# Tick label format per measure; measures not listed use a plain number.
tick_formats = {
    "bytes": lambda value, pos: f'{value/1000:0.0f} Kb',
    "time": lambda value, pos: f'{value:0.0f} s',
}

# The filtered table is the same for every panel, so build it once.
completed = results[success[results.index]]

for measure, ax in zip(["time", "iters", "bytes", "classes"], axes.flatten()):
    per_strategy = completed[measure].unstack("strategy")
    ranks = list(range(1, len(per_strategy) + 1))

    largest = 0
    for strategy in strategies:
        values = per_strategy[strategy]
        ax.plot(sorted(values), ranks, label=strategy)
        largest = max(largest, max(values))

    # y axis: 1 .. number of programs, with six evenly spaced ticks.
    ylim = (1, len(per_strategy))
    ax.set_yticks(np.linspace(*ylim, 6))
    ax.set_ylim(*ylim)

    # x axis: the time panel uses a fixed upper bound, the others the
    # largest plotted value.
    xlim = (0, 3200) if measure == "time" else (0, largest)
    ax.set_xticks(np.linspace(*xlim, 5))
    ax.set_xlim(*xlim)

    format_tick = tick_formats.get(measure, lambda value, pos: f'{value:0.0f}')
    ax.xaxis.set_major_formatter(plt.FuncFormatter(format_tick))

    ax.set_xlabel(measure)

    # Drop the top/right frame and draw the remaining axes in gray, slightly
    # detached from the plotting area.
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

    for side in ("left", "bottom"):
        ax.spines[side].set_position(("outward", 5))
        ax.spines[side].set_color("gray")

    for axis in (ax.yaxis, ax.xaxis):
        for tick in axis.get_major_ticks():
            tick.label1.set_color("gray")
            tick.tick1line.set_color("gray")

# `ax` is still the last panel here, so the legend is drawn there only.
ax.legend()
fig.tight_layout()

The graphs are formatted like in the previous article: in the top row we have the number of programs that complete before a certain time and number of iterations. In the bottom row we have the number of programs that have been reduced to a size below a certain number of bytes or classes.


## Extra evaluation

Here I have left some space for some extra interesting questions: 

The first question is how the size of the input in bytes affects the time to set up and run the predicate. In this case it is fernflower.

The interesting thing here is that the execution time of the predicate is dependent on the size of the input, and by testing small inputs it can be up to 10 times faster than testing the large inputs.

In [None]:
# Scatter the "setup time" and "run time" columns of one run's metrics file
# against the `bytes` column, one panel per timing column.
metrics_csv = "result/url0e7ea11f42_rbouckaert_DensiTree/fernflower/logic+over/workfolder/metrics.csv"
metrics = pd.read_csv(metrics_csv)

fig, axes = plt.subplots(2, 1, figsize=(7, 7), sharex=True)

for key, ax in zip(["setup time", "run time"], axes):
    input_bytes = metrics.bytes
    timings = metrics[key]

    ax.scatter(input_bytes, timings)
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, pos: f'{value:0.1f} s'))

    # Start both axes at zero and leave 10% headroom above the largest value.
    ax.set_xlim(0, input_bytes.max() * 1.1)
    ax.set_ylim(0, timings.max() * 1.1)

    for side in ("right", "top"):
        ax.spines[side].set_visible(False)
    ax.set_title(key)

    for side in ("left", "bottom"):
        ax.spines[side].set_position(("outward", 5))
        ax.spines[side].set_color("gray")

    for axis in (ax.yaxis, ax.xaxis):
        for tick in axis.get_major_ticks():
            tick.label1.set_color("gray")
            tick.tick1line.set_color("gray")

# `ax` is the last (bottom) panel at this point.
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, pos: f'{value/1000:0.0f} Kb'))

fig.tight_layout()

# Appendix. Here Be Dragons!

In [None]:
# Horizontal bars, one group per (name, predicate) pair: for every strategy
# other than "classes", the bar spans from 1 to the ratio
# (value of "classes") / (value of that strategy), on a logarithmic x axis.
fig, axes = plt.subplots(1, 2, figsize=(14, 7), sharey=True)

metrics = ["bytes", "classes"]

labels = [f"{a}:{b}" for a, b in sucessfull.unstack("strategy").index]
positions = np.arange(len(labels))
total_width = 0.75

for metric, ax in zip(metrics, list(axes)):
    by_strategy = sucessfull[metric].unstack("strategy")
    others = by_strategy.drop(["classes"], axis=1)
    rest = 1 / others.div(by_strategy.classes, axis=0, level=0)

    # Each group shares `total_width`, split evenly between the strategies.
    width = total_width / len(rest.columns)

    # NOTE(review): `minx` is never updated below, so the lower x limit is
    # always 1/2 — confirm whether it was meant to track the smallest ratio.
    minx, maxx = 1, 0
    for n, strategy in enumerate(rest.columns):
        ratios = rest[strategy]
        offset = positions - (total_width/2 - width*n - width/2)
        ax.barh(
            offset,
            [1 - r if r < 1 else r - 1 for r in ratios],
            height=width * 0.75,
            left=[min(r, 1) for r in ratios],
            label=strategy,
        )
        maxx = max(maxx, ratios.max())

    ax.set_ylim(-total_width, len(labels) - 1 + total_width)
    ax.set_xlabel(metric)
    ax.set_xlim(minx/2, maxx*2)
    ax.set_xscale("log")
    # Disabled alternative: ax.set_xticks([1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1,2, 4, 8])
    ax.xaxis.set_major_locator(plt.LogLocator(10))
    # Disabled alternative formatter: plt.LogFormatterMathtext(2)
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda tick, pos: f"{1/tick}x"))
    # Disabled alternative: ax.set_xticklabels([1/8, 1/4, 1/2,1,2, 4, 8])

    # Reference line at ratio 1.
    ax.axvline(1, ls='-', color='lightgray', lw=1)

    for spine in ax.spines.values():
        spine.set_visible(False)

    for side in ("left", "bottom"):
        ax.spines[side].set_position(("outward", 5))

# `ax` is the last panel here; the y ticks and the legend are set on it.
ax.set_yticks(positions)
ax.set_yticklabels(labels)

ax.legend()
fig.tight_layout()