This is a Jupyter notebook. It contains our analysis of the results. You can look at the data below.
If you want to reproduce the data, first delete the file `output.raw`. Then click on `Cell` in the menu above and choose `Run All`; this re-evaluates all the code. Next, click the `Gather Data` button below. This starts our program and gathers data with the default parameters, which might take a long time. You can change the parameters via the forms above the `Gather Data` button. As soon as data is generated, the plots below will update automatically.

If you know Python and want to do additional analysis, the data is available in JSON format in `data.value` or in the `output.raw` file. Note that the file is not valid JSON on its own: the `,` separators between the records (and the enclosing `[` `]`) are missing and must be added. The `read("output.raw")` function does this for you — call it, or look at its source.

In [36]:
import collections
import collections.abc
import itertools
import json
import os.path
import subprocess
import sys

import ipywidgets
import matplotlib.pyplot as plt
from functional import seq
from ipywidgets import widgets
from IPython.display import HTML
from IPython.display import display

# Optional UI dependencies: when they are missing the notebook falls back to a
# plain, widget-free mode (IPYWIDGETS = False) and a no-op progress wrapper.
try:
    from ipywidgets import widgets
    from progressbar import log_progress
    IPYWIDGETS = True
except ImportError:
    # ImportError also covers ModuleNotFoundError (its subclass) and the case
    # where a module named `progressbar` exists but has no `log_progress`.
    def log_progress(sequence, *args, **kwargs):
        """Fallback progress wrapper: return ``sequence`` unchanged.

        Extra positional/keyword arguments (e.g. ``every=1``) are accepted and
        ignored so callers can use the same call as with the real widget.
        """
        return sequence

    IPYWIDGETS = False

In [37]:
# Shared text widget (never displayed itself) whose value holds the gathered
# results as one JSON string. sync_data() and the "Gather Data" click handler
# write to data.value; line() passes this widget to interactive_output so the
# plots are driven by its value.
data = widgets.Text(value="")

In [38]:
def sync_data():
    """Best-effort refresh of ``data.value`` from the ``output.raw`` file.

    The file is parsed with ``read`` and the result is stored, re-serialised
    as JSON, in the shared ``data`` widget. Any exception raised along the way
    is swallowed, in which case the widget keeps its previous value.
    """
    try:
        records = read("output.raw")
        data.value = json.dumps(records)
    except Exception:
        return None

In [39]:
def read(path):
    """Parse a file of concatenated JSON values into a list.

    The output file is produced by appending one program run after another,
    so it is a stream of JSON values rather than a single valid document.
    Values may be separated by any mix of JSON whitespace (including none at
    all, or blank lines) and optional commas.

    Args:
        path: Path of the file to read.

    Returns:
        A list with the decoded values in file order; ``[]`` for an empty file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file contains text that is not valid JSON.
    """
    with open(path, "r") as f:
        text = f.read()

    decoder = json.JSONDecoder()
    separators = " \t\n\r,"
    records = []
    pos, end = 0, len(text)
    while True:
        # raw_decode() does not skip leading whitespace itself, so step over
        # whitespace and any separating commas before each value.
        while pos < end and text[pos] in separators:
            pos += 1
        if pos >= end:
            break
        record, pos = decoder.raw_decode(text, pos)
        records.append(record)
    return records

In [40]:
def call(p, randomized_hashes, bucket_width, wauto):
    """Build the argument tuple that launches the Java ``KMeans`` program.

    Args:
        p: Value passed after ``-p`` (converted with ``str``).
        randomized_hashes: Boolean passed after ``-r`` as ``"true"``/``"false"``.
        bucket_width: Value passed after ``-width`` (converted with ``str``).
        wauto: Boolean passed after ``-width_auto`` as ``"true"``/``"false"``.

    Returns:
        A tuple of strings suitable for ``subprocess.run``.
    """
    # The classpath separator is platform specific (":" on POSIX, ";" on
    # Windows); os.pathsep yields the right one for the running platform.
    classpath = os.pathsep.join((".", "lib/commons-cli-1.4.jar"))
    return (
        "java", "-classpath", classpath, "KMeans",
        "-testdata", "LSH-nmi-adapted.csv",
        "-width", str(bucket_width),
        "-p", str(p),
        "-width_auto", str(wauto).lower(),
        "-r", str(randomized_hashes).lower(),
    )

In [41]:
def _as_iterable(value):
    """Return ``value`` unchanged if it is iterable, else wrap it in a one-element list."""
    if isinstance(value, collections.abc.Iterable):
        return value
    return [value]


def run_tries(tries=1, ps=(0,5,10), rs=(False, True), ws=(10, 33, 70), wautos=(False, True)):
    """Run the external program for every parameter combination and collect its output.

    Each of ``ps``, ``rs``, ``ws`` and ``wautos`` may be a single value or an
    iterable of values. The program is started ``tries`` times for every
    element of their cartesian product.

    For each run, the captured stdout is appended to ``output.raw`` when the
    process exits with code 0; otherwise it is printed to stderr. After every
    run ``sync_data()`` is called.

    Args:
        tries: Number of repetitions per parameter combination.
        ps: Value(s) forwarded to ``call`` as ``p``.
        rs: Value(s) forwarded to ``call`` as ``randomized_hashes``.
        ws: Value(s) forwarded to ``call`` as ``bucket_width``.
        wautos: Value(s) forwarded to ``call`` as ``wauto``.
    """
    # collections.abc.Iterable is used because the old collections.Iterable
    # alias no longer exists in current Python versions.
    ps, rs, ws, wautos = (_as_iterable(values) for values in (ps, rs, ws, wautos))

    product = list(itertools.product(ps, rs, ws, wautos, range(tries)))
    for p, r, w, wauto, _ in log_progress(product, every=1):
        process = subprocess.run(call(p, r, w, wauto), stdout=subprocess.PIPE)
        if process.returncode == 0:
            # Binary append: the captured stdout is bytes and is stored verbatim.
            with open("output.raw", mode="ab") as f:
                f.write(process.stdout)
        else:
            # Decode so the message is readable instead of a b'...' repr.
            print(process.stdout.decode(errors="replace"), file=sys.stderr)
        sync_data()

In [42]:
def boxplot(data, x, y):
    """Show a box plot of ``y``, with one box per distinct value of ``x``.

    Args:
        data: Iterable of records that can be indexed by ``x`` and ``y``.
        x: Key whose distinct values define the groups (sorted ascending).
        y: Key whose values are summarised in each box.
    """
    grouped = seq(data).group_by(lambda record: record[x]).sorted(lambda group: group[0])
    labels = list(grouped.map(lambda group: group[0]))
    samples = list(grouped.map(lambda group: [record[y] for record in group[1]]))

    plt.boxplot(samples)
    # boxplot() draws its boxes at positions 1..N, so the tick positions must
    # be 1-based as well; 0-based ticks would shift every label by one box.
    plt.xticks(list(range(1, len(samples) + 1)), labels)

    plt.xlabel(x)
    plt.ylabel(y)
    plt.show()

In [43]:
def line(data, x, y):
    """Return an output widget that plots ``y`` against ``x`` from ``data``.

    ``data`` is a widget whose value is a JSON string encoding a list of
    records. For every distinct ``x`` value the mean of ``y`` is drawn as a
    line, and every individual record is drawn on top as a dot. The plotting
    callback is bound to ``data`` through ``ipywidgets.interactive_output``.
    """

    def show_me(j):
        # A value that is not valid JSON (e.g. the initial empty string)
        # simply produces no plot.
        try:
            records = json.loads(j)
        except json.JSONDecodeError:
            return

        groups = seq(records).group_by(lambda record: record[x]).sorted(lambda group: group[0])

        mean_xs, mean_ys = [], []
        point_xs, point_ys = [], []
        for group in groups:
            key, members = group[0], group[1]
            values = [member[y] for member in members]
            mean_xs.append(key)
            mean_ys.append(sum(values) / len(members))
            point_xs.extend(key for _ in members)
            point_ys.extend(values)

        plt.plot(mean_xs, mean_ys)
        plt.plot(point_xs, point_ys, "o")

        plt.xlabel(x)
        plt.ylabel(y)
        plt.show()

    return ipywidgets.interactive_output(show_me, {"j": data})


In [44]:
# Report whether results already exist on disk; in widget mode, existing
# results are also loaded into the shared `data` widget.
if os.path.isfile("output.raw"):
    if not IPYWIDGETS:
        print("Found data, carry on with analysis.")
    else:
        label = widgets.HTML("Data already found! You don't need to gather data to carry on.")
        data.value = json.dumps(read("output.raw"))
        display(label)
elif IPYWIDGETS:
    label = widgets.HTML("Couldn't find output, please gather some.")
    display(label)
else:
    print("Couldn't find output. I'm now generating data, this might take a long time.")
    
if IPYWIDGETS:
    # Every layout collected here is hidden while data is being gathered and
    # made visible again afterwards (see on_click below).
    layouts = []
    default_layout = widgets.Layout()
    layouts.append(default_layout)

    button_layout = widgets.Layout(margin="20px 40% 20px 40%", padding="20px auto 20px auto", height="50px")
    layouts.append(button_layout)
    button = widgets.Button(description="Gather Data", layout=button_layout, button_style="danger")

    # Toggle label -> list of values handed to run_tries() as `rs`.
    random_hashes_dict = {"Random Hashes": [True], "Canonical Projections": [False], "Both": [True, False]}
    random_hashes_widget = widgets.ToggleButtons(options=list(random_hashes_dict.keys()),
                                                 value="Both", layout=default_layout)

    ps_widget_layout = widgets.Layout(width="100%")
    layouts.append(ps_widget_layout)
    ps_widget = widgets.IntRangeSlider(value=[0,10], min=0, max=10, step=1,
                                       layout=ps_widget_layout, description="Parameter P for similarity")

    display(random_hashes_widget)
    display(ps_widget)

    # Toggle label -> list of values handed to run_tries() as `wautos`.
    widthautodict = {"Buckets Auto Width": [True], "Buckets Manual Width": [False], "Both": [True, False]}
    widthauto = widgets.ToggleButtons(options=list(widthautodict.keys()), value="Both", layout=default_layout)
    display(widthauto)

    widths_widget_layout = widgets.Layout(width="100%")
    layouts.append(widths_widget_layout)
    widths_widget = widgets.IntRangeSlider(value=[10, 70], min=1, max=200, step=10,
                                           layout=widths_widget_layout, description="Bucket width")
    display(widths_widget)

    def disable_widths(*args):
        """Hide the bucket-width slider while only "Buckets Auto Width" is selected."""
        if widthauto.value == "Buckets Auto Width":
            widths_widget_layout.display = "none"
        else:
            widths_widget_layout.display = None

    widthauto.observe(disable_widths)

    display(button)

    def on_click(_):
        """Gather data with the parameters chosen in the widgets, then refresh ``data``.

        All controls are hidden while the runs are in progress and are shown
        again afterwards, even if gathering fails.
        """
        rs = random_hashes_dict[random_hashes_widget.value]
        wauto = widthautodict[widthauto.value]
        for layout in layouts:
            layout.display = "none"
        try:
            # NOTE(review): the range sliders yield (low, high), so only those
            # two endpoint values are run, not every value in between —
            # confirm this is intended.
            run_tries(rs=list(set(rs)),
                      ps=list(set(ps_widget.value)),
                      ws=list(set(widths_widget.value)),
                      wautos=list(set(wauto)))
        finally:
            # Restore the controls even when run_tries() raises; otherwise the
            # whole form would stay hidden.
            for layout in layouts:
                layout.display = None
            # Showing everything also un-hides the width slider, so re-apply
            # the rule that hides it for "Buckets Auto Width".
            disable_widths()
        data_raw = read("output.raw")
        data.value = json.dumps(data_raw)

    button.on_click(on_click)
else:
    # Without widgets there is no button: gather data right away if none exists.
    if not (os.path.isfile("output.raw")):
        run_tries()

In [45]:
# List the keys of the first gathered record in a one-column HTML table.
# The header text "Verfügbare Daten" is German for "Available data".
try:
    table_content = ("<table><tr><th>Verfügbare Daten</th></tr><tr><td>" +
            "</td></tr><tr><td>".join(list(json.loads(data.value)[0].keys())) +
            "</td></tr></table>")

    display(HTML(table_content))
except (json.JSONDecodeError, IndexError):
    # No data yet: data.value is either not valid JSON (e.g. still empty) or
    # an empty list, so there is no first record to take the keys from.
    pass

In [46]:
# Mean "time" per "randomizedHashes" value (line), individual records overlaid as dots.
line(data, x="randomizedHashes", y="time")

In [47]:
# Mean "time" per "bucketWidthsAuto" value (line), individual records overlaid as dots.
line(data, x="bucketWidthsAuto", y="time")

In [48]:
# HTML/JavaScript snippet that hides the notebook's code input areas.
# On document ready it hides every `div.input` element and sets `code_shown`
# to false; the form button (initially labelled "Show Code") calls
# code_toggle(), which shows or hides those elements and flips the label
# between "Show Code" and "Hide Code".
# NOTE(review): the script uses `$`, so it assumes jQuery is available in the
# page that renders this output — confirm for the front end in use.
hide_code_javascript = """<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
"""
# Render the snippet as the cell output so the browser executes it.
display(HTML(hide_code_javascript))

In [49]:
# Mean "time" per "bucket width" value (line), individual records overlaid as dots.
line(data, x="bucket width", y="time")

In [50]:
# Mean "time" per "p" value (line), individual records overlaid as dots.
line(data, x="p", y="time")

In [51]:
# Mean "NMI" per "p" value (line), individual records overlaid as dots.
line(data, x="p", y="NMI")

In [52]:
# Mean "NMI" per "bucket width" value (line), individual records overlaid as dots.
line(data, x="bucket width", y="NMI")