In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from shutil import copyfile
from benchmark import pickler

In [None]:
# Load the benchmark results. Later cells read the keys 'tidre', 'timestamp',
# 'in_size' and 'batch_size' from the returned mapping.
# NOTE(review): presumably this unpickles ./data.pickle (the file that is backed up
# into the plot directory further down) - confirm in benchmark.pickler, and only
# load pickle files that come from a trusted source.
data = pickler.load_from_notebooks()

In [None]:
# Metadata stored next to the measurements in the loaded results
tidre = data['tidre']
timestamp = data['timestamp']

print("Run using tidre: ", tidre)
print("Data generated on", timestamp)

# Determine the output path for generated plots: ./plots/<tidre|re2>/<timestamp>
out_dir = './plots/' + ('tidre/' if tidre else 're2/') + timestamp

# Create the directory (including missing parents). exist_ok=True replaces the
# separate existence check, so there is no window between checking and creating.
os.makedirs(out_dir, exist_ok=True)

# Copy the data file to the out dir as a backup
# NOTE(review): assumes ./data.pickle is the file the results were loaded from - confirm.
copyfile('./data.pickle', out_dir + '/data.pickle')

In [None]:
# Helper used by the cells below to draw grouped bar charts
def make_bar_plot(x_values, series, plot_title, axes_titles, log_scale, rotate_x_labels, plot_name):
    """Draw a grouped bar chart, save it as ``<out_dir>/<plot_name>.png`` and show it.

    The output location comes from the notebook-level ``out_dir`` variable.

    Args:
        x_values: Tick labels, one per group of bars.
        series: Sequence of dicts, each with a ``'data'`` entry (bar heights, one
            per group) and a ``'label'`` entry (legend text). At least one series
            is required; the first one determines the number of groups.
        plot_title: Title shown above the plot.
        axes_titles: Dict holding the axis labels under the keys ``'x'`` and ``'y'``.
        log_scale: If true, the y axis uses a logarithmic scale.
        rotate_x_labels: If true, the x tick labels are rotated by -90 degrees.
        plot_name: File name (without extension) of the saved figure.
    """
    plot_data = list(series)
    n_groups = len(plot_data[0]['data'])
    n_series = len(plot_data)

    # Bars are 0.2 wide for up to four series. With more series they shrink so a
    # group never spans more than 0.8 and neighbouring groups cannot overlap.
    bar_width = min(0.2, 0.8 / n_series)

    # Left-most bar of every group sits on the integer positions 0, 1, 2, ...
    group_start = np.arange(n_groups)

    fig = plt.figure(figsize=(9, 7))
    fig.patch.set_facecolor('white')

    # 'orange' replaces the former 'o', which is a marker code and not a valid
    # colour, so plotting a fifth series used to raise.
    colors = ['b', 'r', 'g', 'y', 'orange']
    for i, entry in enumerate(plot_data):
        plt.bar(
            group_start + i * bar_width,
            entry['data'],
            color=colors[i % len(colors)],
            width=bar_width,
            edgecolor='white',
            label=entry['label'],
            zorder=3,
        )

    plt.xlabel(axes_titles['x'])
    plt.ylabel(axes_titles['y'])

    # Put each tick in the middle of its group of bars
    plt.xticks(group_start + bar_width * (n_series - 1) / 2, x_values)
    if rotate_x_labels:
        plt.xticks(rotation=-90)

    plt.title(plot_title)

    # Dashed horizontal grid lines only
    plt.gca().grid(which='both', axis='y', linestyle='--')

    if log_scale:
        plt.yscale('log')

    plt.legend()

    # Save the figure as a PNG before showing it
    plt.savefig(os.path.join(out_dir, plot_name + '.png'))

    plt.show()

In [None]:
def make_line_plot(x_values, series, plot_title, axes_titles, log_scale, rotate_x_labels, plot_name, horizontal_line=None, line_name=None):
    """Draw a line chart, save it as ``<out_dir>/<plot_name>.png`` and show it.

    The output location comes from the notebook-level ``out_dir`` variable.

    Args:
        x_values: X coordinates shared by all series; also used as the x ticks.
        series: Sequence of dicts, each with a ``'data'`` entry (y values, one per
            x value) and a ``'label'`` entry (legend text).
        plot_title: Title shown above the plot.
        axes_titles: Dict holding the axis labels under the keys ``'x'`` and ``'y'``.
        log_scale: If true, the y axis uses a logarithmic scale.
        rotate_x_labels: If true, the x tick labels are rotated by -90 degrees.
        plot_name: File name (without extension) of the saved figure.
        horizontal_line: Optional y value at which a dashed black reference line
            is drawn across the plot.
        line_name: Legend text for the reference line.
    """
    plot_data = list(series)

    fig = plt.figure(figsize=(9, 7))
    fig.patch.set_facecolor('white')

    if horizontal_line is not None:
        plt.axhline(y=horizontal_line, color='k', linestyle='--', label=line_name)

    # 'orange' replaces the former 'o', which is a marker code and not a valid
    # colour, so plotting a fifth series used to raise.
    colors = ['b', 'r', 'g', 'y', 'orange']
    markers = ['o', 'x', '+', '>', '<']
    for i, entry in enumerate(plot_data):
        plt.plot(
            x_values,
            entry['data'],
            color=colors[i % len(colors)],
            marker=markers[i % len(markers)],
            label=entry['label'],
            zorder=3,
        )

    plt.xlabel(axes_titles['x'])
    plt.ylabel(axes_titles['y'])

    # One tick per measured x value
    plt.xticks(x_values)
    if rotate_x_labels:
        plt.xticks(rotation=-90)

    plt.title(plot_title)

    # Dashed horizontal grid lines only
    plt.gca().grid(which='both', axis='y', linestyle='--')

    if log_scale:
        plt.yscale('log')

    plt.legend()

    # Save the figure as a PNG before showing it
    plt.savefig(os.path.join(out_dir, plot_name + '.png'))

    plt.show()

In [None]:
# Reference values drawn as dashed horizontal lines on the line plots below.
# NOTE(review): these look like hard-coded measurements from a separate tidre run -
# confirm where they come from and whether they need refreshing with new data.

# Shown on the runtime (seconds) plots with the label 'tidre single record'.
tidre_single_record = 0.00024106502532958985
# Shown on the throughput (bytes/s) plots with the label 'tidre 10M records, single batch'.
# 10e6 records times 100 (presumably bytes per record - TODO confirm), divided by
# what is presumably the measured runtime in seconds.
tidre_10M = (10e6 * 100) / 0.28932828903198243

In [None]:
# RUNTIME vs input size

in_size_runs = data['in_size']
x_vals = np.array(list(in_size_runs['vanilla_filter'].keys()))

# One series per benchmarked implementation, in legend order
plot_data = [
    {'data': np.array(list(in_size_runs[result_key].values())), 'label': legend_label}
    for result_key, legend_label in [
        ('vanilla_filter', 'vanilla dask'),
        ('re2_filter', 'dask + re2'),
        ('tidre_filter_unaligned', 'dask + tidre (unaligned)'),
        ('tidre_filter', 'dask + tidre'),
    ]
]

runtime_axes = {'x': 'Number of records', 'y': 'Runtime (seconds)'}
# Only the line plots get the dashed reference line
reference_line = {'horizontal_line': tidre_single_record, 'line_name': 'tidre single record'}

# (plot function, plot title, log-scale y axis, plot name, extra keyword arguments)
for plot_fn, title, use_log, name, extra in [
    (make_bar_plot, 'Total filter op runtime vs input sizes - 1M batch size', False, 'bar_in_sizes', {}),
    (make_bar_plot, 'Total filter op runtime vs input sizes (log) - 1M batch size', True, 'bar_in_sizes_log', {}),
    (make_line_plot, 'Total filter op runtime vs input sizes - 1M batch size', False, 'line_in_sizes', reference_line),
    (make_line_plot, 'Total filter op runtime vs input sizes (log) - 1M batch size', True, 'line_in_sizes_log', reference_line),
]:
    # The x labels are always rotated (sixth argument)
    plot_fn(x_vals, plot_data, title, runtime_axes, use_log, True, name, **extra)

In [None]:
# THROUGHPUT vs input size

in_size_runs = data['in_size']
x_vals = np.array(list(in_size_runs['vanilla_filter'].keys()))

# Record counts of the benchmarked inputs (hard-coded; must match the results loaded above)
in_sizes = [1e3, 2e3, 4e3, 8e3, 16e3, 32e3, 64e3, 128e3, 256e3, 512e3, 1024e3, 2048e3, 4096e3]
# NOTE(review): 100 is presumably the size of one record in bytes - confirm against the benchmark.
bytes_per_record = 100
in_bytes = np.array(in_sizes) * bytes_per_record

# One series per benchmarked implementation, in legend order
plot_data = []
for result_key, legend_label in [
    ('vanilla_filter', 'vanilla dask'),
    ('re2_filter', 'dask + re2'),
    ('tidre_filter_unaligned', 'dask + tidre (unaligned)'),
    ('tidre_filter', 'dask + tidre'),
]:
    runtimes = np.array(list(in_size_runs[result_key].values()))
    # Guard against a mismatch with the hard-coded sizes: a length-1 result would
    # otherwise broadcast silently and plot a wrong throughput curve.
    if runtimes.shape != in_bytes.shape:
        raise ValueError(
            "in_size results for %r have shape %s, expected %s to match in_sizes"
            % (result_key, runtimes.shape, in_bytes.shape)
        )
    plot_data.append({'data': np.divide(in_bytes, runtimes), 'label': legend_label})

throughput_axes = {'x': 'Number of records', 'y': 'Throughput (bytes/s)'}
# Only the line plots get the dashed reference line
reference_line = {'horizontal_line': tidre_10M, 'line_name': 'tidre 10M records, single batch'}

# (plot function, plot title, log-scale y axis, plot name, extra keyword arguments)
for plot_fn, title, use_log, name, extra in [
    (make_bar_plot, 'Total filter op throughput vs input sizes - 1M batch size', False, 'bar_throughput_in_sizes', {}),
    (make_bar_plot, 'Total filter op throughput vs input sizes (log) - 1M batch size', True, 'bar_throughput_in_sizes_log', {}),
    (make_line_plot, 'Total filter op throughput vs input sizes - 1M batch size', False, 'line_throughput_in_sizes', reference_line),
    (make_line_plot, 'Total filter op throughput vs input sizes (log) - 1M batch size', True, 'line_throughput_in_sizes_log', reference_line),
]:
    # The x labels are always rotated (sixth argument)
    plot_fn(x_vals, plot_data, title, throughput_axes, use_log, True, name, **extra)

In [None]:
# RUNTIME vs batch size

batch_size_runs = data['batch_size']
x_vals = np.array(list(batch_size_runs['vanilla_filter'].keys()))

# One series per benchmarked implementation, in legend order
plot_data = [
    {'data': np.array(list(batch_size_runs[result_key].values())), 'label': legend_label}
    for result_key, legend_label in [
        ('vanilla_filter', 'vanilla dask'),
        ('re2_filter', 'dask + re2'),
        ('tidre_filter_unaligned', 'dask + tidre (unaligned)'),
        ('tidre_filter', 'dask + tidre'),
    ]
]

runtime_axes = {'x': 'Batch size (records)', 'y': 'Runtime (seconds)'}
# Only the line plots get the dashed reference line
reference_line = {'horizontal_line': tidre_single_record, 'line_name': 'tidre single record'}

# (plot function, plot title, log-scale y axis, plot name, extra keyword arguments)
for plot_fn, title, use_log, name, extra in [
    (make_bar_plot, 'Total filter op runtime vs batch sizes - 4M records', False, 'bar_batch_sizes', {}),
    (make_bar_plot, 'Total filter op runtime vs batch sizes (log) - 4M records', True, 'bar_batch_sizes_log', {}),
    (make_line_plot, 'Total filter op runtime vs batch sizes - 4M records', False, 'line_batch_sizes', reference_line),
    (make_line_plot, 'Total filter op runtime vs batch sizes (log) - 4M records', True, 'line_batch_sizes_log', reference_line),
]:
    # The x labels are always rotated (sixth argument)
    plot_fn(x_vals, plot_data, title, runtime_axes, use_log, True, name, **extra)

In [None]:
# THROUGHPUT vs batch size

batch_size_runs = data['batch_size']
x_vals = np.array(list(batch_size_runs['vanilla_filter'].keys()))

# Same numerator for every batch size: 4096e3 * 100
# NOTE(review): presumably 4096e3 records of 100 bytes each - confirm against the benchmark.
in_bytes = 4096e3 * 100 * 1

# One series per benchmarked implementation, in legend order
plot_data = [
    {
        'data': np.divide(in_bytes, np.array(list(batch_size_runs[result_key].values()))),
        'label': legend_label,
    }
    for result_key, legend_label in [
        ('vanilla_filter', 'vanilla dask'),
        ('re2_filter', 'dask + re2'),
        ('tidre_filter_unaligned', 'dask + tidre (unaligned)'),
        ('tidre_filter', 'dask + tidre'),
    ]
]

throughput_axes = {'x': 'Batch size (records)', 'y': 'Throughput (bytes/s)'}
# Only the line plots get the dashed reference line
reference_line = {'horizontal_line': tidre_10M, 'line_name': 'tidre 10M records, single batch'}

# (plot function, plot title, log-scale y axis, plot name, extra keyword arguments)
for plot_fn, title, use_log, name, extra in [
    (make_bar_plot, 'Total filter op throughput vs batch sizes - 4M records', False, 'bar_throughput_batch_sizes', {}),
    (make_bar_plot, 'Total filter op throughput vs batch sizes (log) - 4M records', True, 'bar_throughput_batch_sizes_log', {}),
    (make_line_plot, 'Total filter op throughput vs batch sizes - 4M records', False, 'line_throughput_batch_sizes', reference_line),
    (make_line_plot, 'Total filter op throughput vs batch sizes (log) - 4M records', True, 'line_throughput_batch_sizes_log', reference_line),
]:
    # The x labels are always rotated (sixth argument)
    plot_fn(x_vals, plot_data, title, throughput_axes, use_log, True, name, **extra)

In [None]:
# RUNTIME vs batch size, tidre variants only

batch_size_runs = data['batch_size']
# The x values are taken from the vanilla run's keys, as in the other batch-size cells
x_vals = np.array(list(batch_size_runs['vanilla_filter'].keys()))

plot_data = [
    {'data': np.array(list(batch_size_runs[result_key].values())), 'label': legend_label}
    for result_key, legend_label in [
        ('tidre_filter_unaligned', 'dask + tidre (unaligned)'),
        ('tidre_filter', 'dask + tidre'),
    ]
]

make_line_plot(
    x_values=x_vals,
    series=plot_data,
    plot_title='Total Tidre filter op runtime vs batch sizes - 4M records',
    axes_titles={'x': 'Batch size (records)', 'y': 'Runtime (seconds)'},
    log_scale=False,
    rotate_x_labels=True,
    plot_name='line_batch_sizes_tidre_only',
    horizontal_line=tidre_single_record,
    line_name='tidre single record',
)

In [None]:
# THROUGHPUT vs batch size, tidre variants only

batch_size_runs = data['batch_size']
# The x values are taken from the vanilla run's keys, as in the other batch-size cells
x_vals = np.array(list(batch_size_runs['vanilla_filter'].keys()))

# Same numerator for every batch size: 4096e3 * 100
# NOTE(review): presumably 4096e3 records of 100 bytes each - confirm against the benchmark.
in_bytes = 4096e3 * 100 * 1

plot_data = [
    {
        'data': np.divide(in_bytes, np.array(list(batch_size_runs[result_key].values()))),
        'label': legend_label,
    }
    for result_key, legend_label in [
        ('tidre_filter_unaligned', 'dask + tidre (unaligned)'),
        ('tidre_filter', 'dask + tidre'),
    ]
]

make_line_plot(
    x_values=x_vals,
    series=plot_data,
    plot_title='Total Tidre filter op throughput vs batch sizes - 4M records',
    axes_titles={'x': 'Batch size (records)', 'y': 'Throughput (bytes/s)'},
    log_scale=False,
    rotate_x_labels=True,
    plot_name='line_throughput_batch_sizes_tidre_only',
    horizontal_line=tidre_10M,
    line_name='tidre 10M records, single batch',
)